Source code for kindred.Parser

# -*- coding: utf-8 -*-

import kindred
from intervaltree import IntervalTree
from collections import defaultdict
import six

class Parser:
    """
    Runs Spacy on corpus to get sentences and associated tokens

    :ivar model: Model for parsing (e.g. en/de/es/pt/fr/it/nl)
    :ivar nlp: The underlying Spacy language model to use for parsing
    """

    # Loaded Spacy language models keyed by model name. This is a class-level
    # dict, so every Parser instance shares it and each model is loaded once.
    _models = {}

    def __init__(self,model='en_core_web_sm'):
        """
        Create a Parser object that will use Spacy for parsing. It offers all the same languages that Spacy offers. Check out: https://spacy.io/usage/models. Note that the language model needs to be downloaded first (e.g. python -m spacy download en)

        :param model: Name of an available Spacy language model for parsing (e.g. en/de/es/pt/fr/it/nl)
        :type model: str
        """

        # We only load spacy if a Parser is created (to allow ReadTheDocs to build the documentation easily)
        import spacy

        self.model = model

        # Load the model (with the 'ner' pipeline component disabled) only if
        # it has not been loaded before, then reuse the shared instance.
        if model not in Parser._models:
            Parser._models[model] = spacy.load(model, disable=['ner'])

        self.nlp = Parser._models[model]

    def _sentencesGenerator(self,text):
        """
        Run the Spacy model over the text and yield each sentence it finds.

        :param text: Text to parse
        :type text: str
        :return: Generator over the sentences (``parsed.sents``) of the parsed text
        """

        # On Python 2, a byte string is converted to unicode before being
        # passed to the Spacy model (six.text_type is unicode on Python 2).
        if six.PY2 and isinstance(text,str):
            text = six.text_type(text)

        parsed = self.nlp(text)
        for sent in parsed.sents:
            yield sent

    def parse(self,corpus):
        """
        Parse the corpus. Each document will be split into sentences which are then tokenized and parsed for their dependency graph. All parsed information is stored within the corpus object.

        :param corpus: Corpus to parse
        :type corpus: kindred.Corpus
        """

        assert isinstance(corpus,kindred.Corpus)

        # Ignore DeprecationWarning from SortedDict which is inside IntervalTree
        # NOTE: filterwarnings changes the process-wide warning filters, so the
        # suppression stays in place after this method returns.
        import warnings
        warnings.filterwarnings("ignore", category=DeprecationWarning)

        for d in corpus.documents:
            entityIDsToEntities = { entity.entityID:entity for entity in d.entities }

            # Index every entity span by character offsets so that the entities
            # overlapping a token can be looked up by the token's offsets.
            denotationTree = IntervalTree()
            for e in d.entities:
                for a,b in e.position:
                    # Only spans with positive length are indexed (presumably
                    # because IntervalTree rejects empty intervals - TODO confirm)
                    if b > a:
                        denotationTree[a:b] = e.entityID

            for spacySentence in self._sentencesGenerator(d.text):
                tokens = [ kindred.Token(t.text,t.lemma_,t.pos_,t.idx,t.idx+len(t.text)) for t in spacySentence ]

                # The sentence text runs from the start of the first token to
                # the end of the last token in the document text.
                sentenceStart = tokens[0].startPos
                sentenceEnd = tokens[-1].endPos
                sentenceTxt = d.text[sentenceStart:sentenceEnd]

                # Token indices (t.i) are shifted by the index of the sentence's
                # first token so that dependencies index into this sentence's
                # token list.
                indexOffset = spacySentence[0].i
                dependencies = [ (t.head.i-indexOffset,t.i-indexOffset,t.dep_) for t in spacySentence ]

                # Map each entity ID to the indices of the tokens it overlaps
                entityIDsToTokenLocs = defaultdict(list)
                for i,t in enumerate(tokens):
                    entitiesOverlappingWithToken = denotationTree[t.startPos:t.endPos]
                    for interval in entitiesOverlappingWithToken:
                        # The entity ID was stored as the interval's data above
                        entityID = interval.data
                        entityIDsToTokenLocs[entityID].append(i)

                sentence = kindred.Sentence(sentenceTxt, tokens, dependencies, d.sourceFilename)

                # Let's gather up the information about the "known" entities in the sentence
                for entityID,entityLocs in sorted(entityIDsToTokenLocs.items()):
                    # Get the entity associated with this ID
                    e = entityIDsToEntities[entityID]
                    sentence.addEntityAnnotation(e,entityLocs)

                d.addSentence(sentence)

        corpus.parsed = True