# Source code for kindred.CandidateBuilder


from collections import defaultdict
import itertools

import kindred

class CandidateBuilder:
    """
    Generates the set of all possible candidate relations in a corpus.

    :ivar entityCount: Number of entities in each relation (default=2)
    :ivar acceptedEntityTypes: Set of entity-type tuples that candidate relations must match. Each tuple has length entityCount. None will match all candidate relations.
    """

    def __init__(self,entityCount=2,acceptedEntityTypes=None):
        """
        Constructor

        :param entityCount: Number of entities in each relation (default=2). Must be at least 2.
        :param acceptedEntityTypes: Tuples of entity types that candidate relations must match. Each tuple must have length entityCount. None will match all candidate relations.
        :type entityCount: int
        :type acceptedEntityTypes: list of tuples
        """
        assert isinstance(entityCount,int)
        assert entityCount >= 2

        self.entityCount = entityCount

        assert acceptedEntityTypes is None or isinstance(acceptedEntityTypes,list)

        if acceptedEntityTypes is None:
            self.acceptedEntityTypes = None
        else:
            for acceptedEntityType in acceptedEntityTypes:
                assert isinstance(acceptedEntityType,tuple)
                assert len(acceptedEntityType) == entityCount

            # Stored as a set so the per-candidate membership test in build() is O(1)
            self.acceptedEntityTypes = set(acceptedEntityTypes)

    def build(self,corpus):
        """
        Creates the set of all possible relations that exist within the given corpus. Each relation will be contained within a single sentence.

        Every ordered selection of entityCount entities from a sentence is considered, so (a,b) and (b,a) are distinct candidates. Each candidate carries the relation types and argument names already annotated in its document for exactly that ordered entity tuple, without duplicates and in the order they appear in the document's relations.

        :param corpus: Corpus of text with which to build relation candidates
        :type corpus: kindred.Corpus
        :return: List of candidate relations matching entityCount and acceptedEntityTypes
        :rtype: List of kindred.CandidateRelation
        """
        assert isinstance(corpus,kindred.Corpus)
        assert corpus.parsed, "Corpus must have already been parsed"

        acceptedEntityTypes = self.acceptedEntityTypes

        candidates = []
        for doc in corpus.documents:
            # Group the document's annotated relations by their ordered entity tuple.
            # Duplicates are dropped on insertion so the resulting order is
            # deterministic (first seen), unlike a round-trip through set().
            existingRelationsAndArgNames = defaultdict(list)
            for r in doc.relations:
                assert isinstance(r,kindred.Relation)

                knownForEntities = existingRelationsAndArgNames[tuple(r.entities)]
                typeAndArgNames = (r.relationType,tuple(r.argNames))
                if typeAndArgNames not in knownForEntities:
                    knownForEntities.append(typeAndArgNames)

            for sentence in doc.sentences:
                entitiesInSentence = [ entity for entity,_ in sentence.entityAnnotations ]

                for entitiesInRelation in itertools.permutations(entitiesInSentence, self.entityCount):
                    typesInRelation = tuple(e.entityType for e in entitiesInRelation)
                    if acceptedEntityTypes is not None and typesInRelation not in acceptedEntityTypes:
                        # Relation doesn't contain the right entity types (so skip it)
                        continue

                    # .get() rather than indexing: indexing a defaultdict would insert
                    # an empty list for every permutation that has no known relation.
                    knownForCandidate = existingRelationsAndArgNames.get(entitiesInRelation,())

                    # Fresh argName lists per candidate so candidates never share mutable state
                    knownTypesAndArgNames = [ (relationType,list(argNames)) for relationType,argNames in knownForCandidate ]

                    candidateRelation = kindred.CandidateRelation(entities=list(entitiesInRelation),knownTypesAndArgNames=knownTypesAndArgNames,sentence=sentence)
                    candidates.append(candidateRelation)

        return candidates