Source code for kindred.Corpus

import kindred
import random

class Corpus:
	"""
	Collection of text documents.

	:ivar documents: List of :class:`kindred.Document`
	:ivar parsed: Boolean of whether it has been parsed yet. A :class:`kindred.parser` can parse it.
	"""

	def __init__(self,text=None,loadFromSimpleTag=False):
		"""
		Create an empty corpus with no documents, or quickly load one with a single document built from the given text

		:param text: Optional text used to initialize a single document
		:param loadFromSimpleTag: If text is provided, whether the text parameter is in the SimpleTag format and will extract entities and relations accordingly
		:type text: String (with SimpleTag format XML if loadFromSimpleTag is True)
		:type loadFromSimpleTag: bool
		"""

		self.documents = []
		self.parsed = False

		if text is not None:
			doc = kindred.Document(text,loadFromSimpleTag=loadFromSimpleTag)
			self.addDocument(doc)

	def addDocument(self,doc):
		"""
		Add a single document to the corpus

		:param doc: Document to add
		:type doc: kindred.Document
		"""

		assert isinstance(doc,kindred.Document)
		self.documents.append(doc)

	def clone(self):
		"""
		Clone the corpus by cloning each of its documents into a new corpus. The parsed flag is not copied; the new corpus keeps the default (False).

		:return: Clone of the corpus
		:rtype: kindred.Corpus
		"""

		cloned = Corpus()
		for doc in self.documents:
			cloned.addDocument(doc.clone())
		return cloned

	def removeEntities(self):
		"""
		Remove all entities in this corpus (delegates to each document)
		"""

		for doc in self.documents:
			doc.removeEntities()

	def getRelations(self):
		"""
		Get all relations in this corpus

		:return: List of relations, concatenated in document order
		:rtype: list
		"""

		relations = []
		for doc in self.documents:
			relations.extend(doc.relations)
		return relations

	def removeRelations(self):
		"""
		Remove all relations in this corpus (delegates to each document)
		"""

		for doc in self.documents:
			doc.removeRelations()

	def split(self,trainFraction):
		"""
		Randomly split the corpus into two corpora for use as a training and test set. The documents are shared with this corpus (not copied) and keep their original relative order.

		:param trainFraction: Fraction of documents to use in training set (strictly between 0 and 1)
		:type trainFraction: float
		:return: Tuple of training and test corpus
		:rtype: (kindred.Corpus,kindred.Corpus)
		"""
		assert isinstance(trainFraction,float)
		assert 0.0 < trainFraction < 1.0

		docCount = len(self.documents)
		trainCount = int(round(trainFraction*docCount))
		# A set gives constant-time membership tests in the loop below
		trainIndices = set(random.sample(range(docCount),trainCount))

		trainCorpus,testCorpus = Corpus(),Corpus()
		for i,doc in enumerate(self.documents):
			target = trainCorpus if i in trainIndices else testCorpus
			target.addDocument(doc)

		return trainCorpus,testCorpus

	def nfold_split(self,folds):
		"""
		Method for splitting up the corpus multiple times and is used for an n-fold cross validation approach (as a generator). Each iteration, the training and test set for that fold are provided.

		The documents are shuffled and dealt into exactly `folds` groups whose sizes differ by at most one, so every document appears in the test set of exactly one fold. If folds is larger than the number of documents, some test sets will be empty.

		:param folds: Number of folds to create
		:type folds: int
		:return: Tuple of training and test corpus (for iterations=folds)
		:rtype: (kindred.Corpus,kindred.Corpus)
		"""
		assert isinstance(folds,int)
		assert folds > 0

		indices = list(range(len(self.documents)))
		random.shuffle(indices)

		# Strided slices always give exactly `folds` chunks. Fixed-size chunking
		# left a remainder chunk that was never used as a test set, and failed
		# with a zero step when there were fewer documents than folds.
		indexChunks = [ indices[f::folds] for f in range(folds) ]

		for f in range(folds):
			trainCorpus,testCorpus = Corpus(),Corpus()
			for i,indexChunk in enumerate(indexChunks):
				target = testCorpus if i == f else trainCorpus
				for j in indexChunk:
					target.addDocument(self.documents[j])
			yield trainCorpus,testCorpus

	def splitIntoSentences(self):
		"""
		Create a new corpus with one document for each sentence in this corpus. The corpus must already be parsed, and the returned corpus is marked as parsed.

		:return: Corpus with one document per sentence
		:rtype: kindred.Corpus
		"""
		assert self.parsed, "Corpus must be parsed before it can be split into sentences"

		sentenceCorpus = Corpus()
		for doc in self.documents:
			# Each document returns a corpus of its sentences; merge them in order
			tempCorpus = doc.splitIntoSentences()
			sentenceCorpus.documents.extend(tempCorpus.documents)
		sentenceCorpus.parsed = True

		return sentenceCorpus