# Source code for kindred.Document

import kindred
from collections import OrderedDict

class Document:
	"""
	Span of text with associated tagged entities and relations between entities.

	:ivar text: Text in document (plain text or SimpleTag)
	:ivar entities: Entities in document
	:ivar relations: Relations in document
	:ivar sourceFilename: Filename that this document came from
	:ivar metadata: IDs and other information associated with the source (e.g. PMID)
	:ivar sentences: List of sentences (:class:`kindred.Sentence`) if the document has been parsed
	"""

	def __init__(self,text,entities=None,relations=None,sourceFilename=None,metadata=None,loadFromSimpleTag=False):
		"""
		Constructor for a Document that can take text using the SimpleTag XML format, or a set of Entities and Relations with associated text.

		The entities and relations lists passed in are stored directly (not copied).

		:param text: Text in document (plain text or SimpleTag)
		:param entities: Entities in document
		:param relations: Relations in document
		:param sourceFilename: Filename that this document came from
		:param metadata: IDs and other information associated with the source (e.g. PMID)
		:param loadFromSimpleTag: Assumes the text parameter is in the SimpleTag format and will extract entities and relations accordingly
		:type text: str
		:type entities: list of kindred.Entity
		:type relations: list of kindred.Relation
		:type sourceFilename: str
		:type metadata: dict
		:type loadFromSimpleTag: bool
		:raises AssertionError: if entities/relations are given together with loadFromSimpleTag, or are not lists of the expected types
		"""

		self.sourceFilename = sourceFilename
		self.metadata = {} if metadata is None else metadata

		if loadFromSimpleTag:
			assert entities is None and relations is None, 'Entities and relations will be extracted from SimpleTag. They cannot also be passed in as parameters'

			# The SimpleTag parser builds a complete Document; take over its contents
			docToCopy = kindred.loadFunctions.parseSimpleTag(text)
			assert isinstance(docToCopy,kindred.Document)
			self.text = docToCopy.text
			self.entities = docToCopy.entities
			self.relations = docToCopy.relations
		else:
			self.text = text

			if entities is None:
				self.entities = []
			else:
				assert isinstance(entities,list)
				for e in entities:
					assert isinstance(e,kindred.Entity)
				self.entities = entities

			if relations is None:
				self.relations = []
			else:
				assert isinstance(relations,list)
				for r in relations:
					assert isinstance(r,kindred.Relation)
				self.relations = relations

		# Starts empty; sentences are added later through addSentence
		self.sentences = []

	def __repr__(self):
		"""
		String representation of Document (same as :meth:`__str__`)

		:return: string representation
		:rtype: str
		"""
		return self.__str__()

	def __str__(self):
		"""
		String representation of Document showing its text, entities and relations

		:return: string representation
		:rtype: str
		"""

		return u"<Document %s %s %s>"  % (self.text,str(self.entities),str(self.relations))

	def addEntity(self,entity):
		"""
		Add an entity to this document. If document has been parsed, it will add the entity into the sentence structure and associated with tokens.

		:param entity: Entity to add
		:type entity: kindred.Entity
		"""

		self.entities.append(entity)

		# When there are no sentences (document not parsed) this loop does nothing
		for sentence in self.sentences:
			# A token overlaps the entity if it intersects any of the entity's (start,end) spans
			overlappingTokens = [ i for i,t in enumerate(sentence.tokens) if any( t.endPos > eStart and t.startPos < eEnd for eStart,eEnd in entity.position ) ]
			if overlappingTokens:
				sentence.addEntityAnnotation(entity,overlappingTokens)

	def addRelation(self,relation):
		"""
		Add a relation to this document

		:param relation: Relation to add
		:type relation: kindred.Relation
		"""

		self.relations.append(relation)

	def addSentence(self,sentence):
		"""
		Add a sentence to this document

		:param sentence: Sentence to add
		:type sentence: kindred.Sentence
		:raises AssertionError: if sentence is not a kindred.Sentence
		"""

		assert isinstance(sentence,kindred.Sentence)
		self.sentences.append(sentence)

	def clone(self):
		"""
		Clones the document. The clone gets its own entity list, relation list and metadata dict
		(shallow copies, so the Entity/Relation objects themselves are shared), which means adding
		to or removing from one document does not affect the other. Sentences are not carried over,
		so the clone starts with an empty sentence list.

		:return: Clone of the document
		:rtype: kindred.Document
		"""

		# Copy the containers so the clone does not alias this document's lists/dict
		cloned = Document(self.text,entities=list(self.entities),relations=list(self.relations),sourceFilename=self.sourceFilename,metadata=dict(self.metadata))
		return cloned

	def removeEntities(self):
		"""
		Remove all entities in this document (sentence annotations are left untouched)
		"""
		self.entities = []

	def removeRelations(self):
		"""
		Remove all relations in this document
		"""
		self.relations = []

	def splitIntoSentences(self):
		"""
		Create a new corpus with one document for each sentence in this document. Entity and token
		positions are shifted so they are relative to the start of the first token of the sentence.
		Only relations whose entities are all annotated in the same sentence are kept. Sentences with
		no tokens or with only whitespace text are skipped.

		:return: Corpus with one document per sentence
		:rtype: kindred.Corpus
		"""

		sentenceCorpus = kindred.Corpus()

		for sentence in self.sentences:
			# Without tokens there is no start offset to rebase positions against
			if not sentence.tokens:
				continue

			sentenceStart = sentence.tokens[0].startPos

			# Map each original entity to a copy with sentence-relative positions
			entityMap = OrderedDict()
			for e,_ in sentence.entityAnnotations:
				# Rebase every span so non-contiguous entities keep all of their parts
				newPosition = [ (startPos-sentenceStart, endPos-sentenceStart) for startPos,endPos in e.position ]
				entityMap[e] = kindred.Entity(e.entityType,e.text,newPosition,e.sourceEntityID,e.externalID)

			# Keep relations fully contained in this sentence, rebuilt over the new entities
			newRelationsInSentence = []
			for r in self.relations:
				if all( e in entityMap for e in r.entities ):
					newEntitiesInRelation = [ entityMap[e] for e in r.entities ]
					newRelationsInSentence.append(kindred.Relation(r.relationType,newEntitiesInRelation,r.argNames,r.probability))

			newEntitiesInSentence = list(entityMap.values())
			doc = kindred.Document(sentence.text.rstrip('\n'),newEntitiesInSentence,newRelationsInSentence)

			newTokens = [ kindred.Token(t.word,t.lemma,t.partofspeech,t.startPos-sentenceStart,t.endPos-sentenceStart) for t in sentence.tokens ]

			# The new document is already "parsed": it holds exactly one sentence
			newSentence = kindred.Sentence(sentence.text,newTokens,sentence.dependencies,sentence.sourceFilename)
			newSentence.entityAnnotations = [ (entityMap[e],tokenIndices) for e,tokenIndices in sentence.entityAnnotations ]
			doc.sentences = [newSentence]

			if len(doc.text.strip()) > 0:
				sentenceCorpus.addDocument(doc)

		return sentenceCorpus