import kindred
import random
class Corpus:
    """
    Collection of text documents.

    :ivar documents: List of :class:`kindred.Document`
    :ivar parsed: Boolean of whether it has been parsed yet. A :class:`kindred.parser` can parse it.
    """

    def __init__(self, text=None, loadFromSimpleTag=False):
        """
        Create an empty corpus with no documents, or quickly load one with a single document using optional SimpleTag

        :param text: Optional SimpleTag text to initialize a single document
        :param loadFromSimpleTag: If text is provided, whether the text parameter is in the SimpleTag format and will extract entities and relations accordingly
        :type text: String (with SimpleTag format XML)
        :type loadFromSimpleTag: bool
        """
        self.documents = []
        self.parsed = False

        if text is not None:
            doc = kindred.Document(text, loadFromSimpleTag=loadFromSimpleTag)
            self.addDocument(doc)

    def addDocument(self, doc):
        """
        Add a single document to the corpus

        :param doc: Document to add
        :type doc: kindred.Document
        """
        assert isinstance(doc, kindred.Document)
        self.documents.append(doc)

    def clone(self):
        """
        Clone the corpus

        Every document is cloned with its own ``clone`` method. The ``parsed``
        flag is not copied, so the returned corpus keeps the default of False.

        :return: Clone of the corpus
        :rtype: kindred.Corpus
        """
        cloned = Corpus()
        for doc in self.documents:
            cloned.addDocument(doc.clone())
        return cloned

    def removeEntities(self):
        """
        Remove all entities in this corpus
        """
        for doc in self.documents:
            doc.removeEntities()

    def getRelations(self):
        """
        Get all relations in this corpus

        :return: List of relations (concatenated in document order)
        :rtype: list
        """
        relations = []
        for doc in self.documents:
            relations.extend(doc.relations)
        return relations

    def removeRelations(self):
        """
        Remove all relations in this corpus
        """
        for doc in self.documents:
            doc.removeRelations()

    def split(self, trainFraction):
        """
        Randomly split the corpus into two corpus for use as a training and test set

        The returned corpora hold the same document objects as this corpus
        (they are not cloned) and inherit this corpus's ``parsed`` flag.

        :param trainFraction: Fraction of documents to use in training set (strictly between 0.0 and 1.0)
        :type trainFraction: float
        :return: Tuple of training and test corpus
        :rtype: (kindred.Corpus,kindred.Corpus)
        """
        assert isinstance(trainFraction, float)
        assert 0.0 < trainFraction < 1.0

        docCount = len(self.documents)
        trainCount = int(round(trainFraction * docCount))
        trainIndices = set(random.sample(range(docCount), trainCount))

        trainCorpus, testCorpus = Corpus(), Corpus()
        # The documents are shared, so their parse state carries over unchanged
        trainCorpus.parsed = testCorpus.parsed = self.parsed

        for i, doc in enumerate(self.documents):
            target = trainCorpus if i in trainIndices else testCorpus
            # Documents already belong to this corpus, so they are appended directly
            target.documents.append(doc)

        return trainCorpus, testCorpus

    def nfold_split(self, folds):
        """
        Method for splitting up the corpus multiple times and is used for an n-fold cross validation approach (as a generator). Each iteration, the training and test set for that fold are provided.

        The documents are shuffled and cut into exactly ``folds`` contiguous
        chunks whose sizes differ by at most one, so every document is in the
        test set of exactly one fold. If ``folds`` is larger than the number
        of documents, the surplus folds have an empty test set. The yielded
        corpora share this corpus's document objects and ``parsed`` flag.

        :param folds: Number of folds to create
        :type folds: int
        :return: Tuple of training and test corpus (for iterations=folds)
        :rtype: (kindred.Corpus,kindred.Corpus)
        """
        assert isinstance(folds, int)
        assert folds > 0

        indices = list(range(len(self.documents)))
        random.shuffle(indices)

        # The first `extra` chunks take one additional document so that the
        # remainder is spread out instead of forming a chunk that is never tested
        baseSize, extra = divmod(len(indices), folds)
        indexChunks = []
        start = 0
        for f in range(folds):
            size = baseSize + (1 if f < extra else 0)
            indexChunks.append(indices[start:start + size])
            start += size

        for f in range(folds):
            trainCorpus, testCorpus = Corpus(), Corpus()
            # The documents are shared, so their parse state carries over unchanged
            trainCorpus.parsed = testCorpus.parsed = self.parsed

            for i, indexChunk in enumerate(indexChunks):
                target = testCorpus if i == f else trainCorpus
                target.documents.extend(self.documents[j] for j in indexChunk)

            yield trainCorpus, testCorpus

    def splitIntoSentences(self):
        """
        Create a new corpus with one document for each sentence in this corpus.

        :return: Corpus with one document per sentence
        :rtype: kindred.Corpus
        """
        assert self.parsed, "Corpus must be parsed before it can be split into sentences"

        sentenceCorpus = Corpus()
        for doc in self.documents:
            sentenceCorpus.documents += doc.splitIntoSentences().documents

        sentenceCorpus.parsed = True
        return sentenceCorpus