# -*- coding: utf-8 -*-
import kindred
from intervaltree import IntervalTree
from collections import defaultdict
import six

class Parser:
	"""
	Runs Spacy on corpus to get sentences and associated tokens

	:ivar model: Model for parsing (e.g. en/de/es/pt/fr/it/nl)
	:ivar nlp: The underlying Spacy language model to use for parsing
	"""

	_models = {}

	def __init__(self, model='en_core_web_sm'):
		"""
		Create a Parser object that will use Spacy for parsing. It offers all the same languages that Spacy offers. Check out: https://spacy.io/usage/models. Note that the language model needs to be downloaded first (e.g. python -m spacy download en_core_web_sm)

		:param model: Name of an available Spacy language model for parsing (e.g. en/de/es/pt/fr/it/nl)
		:type model: str
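
		Example (a minimal sketch; assumes the default en_core_web_sm model has already been downloaded)::

			# Creating a second Parser with the same model reuses the cached Spacy model
			parser = kindred.Parser(model='en_core_web_sm')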
"""
		# We only load spacy if a Parser is created (to allow ReadTheDocs to build the documentation easily)
		import spacy

		self.model = model

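		# Cache loaded Spacy models at the class level so that creating
		# several Parser instances doesn't reload the same model each time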
		if model not in Parser._models:
			Parser._models[model] = spacy.load(model, disable=['ner'])
		self.nlp = Parser._models[model]

	def _sentencesGenerator(self, text):
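		# Spacy expects unicode text, so convert Python 2 byte strings first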
		if six.PY2 and isinstance(text, str):
			text = unicode(text)

		parsed = self.nlp(text)
		for sent in parsed.sents:
			yield sent

	def parse(self, corpus):
		"""
		Parse the corpus. Each document will be split into sentences which are then tokenized and parsed for their dependency graph. All parsed information is stored within the corpus object.

		:param corpus: Corpus to parse
		:type corpus: kindred.Corpus
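
		Example (a minimal sketch; assumes a corpus built from plain text with no annotations)::

			corpus = kindred.Corpus('Erlotinib is a treatment for lung cancer.')
			parser = kindred.Parser()
			parser.parse(corpus)
			# corpus.parsed is now True and each document holds kindred.Sentence objects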
"""
assert isinstance(corpus,kindred.Corpus)
# Ignore DeprecationWarning from SortedDict which is inside IntervalTree
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
for d in corpus.documents:
			entityIDsToEntities = { entity.entityID: entity for entity in d.entities }
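
			# Build an interval tree over the character spans of every entity
			# so each token can be quickly matched to the entities it overlaps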
			denotationTree = IntervalTree()
			entityTypeLookup = {}
			for e in d.entities:
				entityTypeLookup[e.entityID] = e.entityType

				for a, b in e.position:
					if b > a:
						denotationTree[a:b] = e.entityID

			for sentence in self._sentencesGenerator(d.text):
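				# Convert each Spacy token into a kindred Token, keeping its
				# character offsets within the document text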
				tokens = []
				for t in sentence:
					token = kindred.Token(t.text, t.lemma_, t.pos_, t.idx, t.idx + len(t.text))
					tokens.append(token)

				sentenceStart = tokens[0].startPos
				sentenceEnd = tokens[-1].endPos
				sentenceTxt = d.text[sentenceStart:sentenceEnd]
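
				# Store each dependency edge as (head index, child index, relation),
				# converting Spacy's document-level token indices to sentence-level ones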
				indexOffset = sentence[0].i
				dependencies = []
				for t in sentence:
					depName = t.dep_
					dep = (t.head.i - indexOffset, t.i - indexOffset, depName)
					dependencies.append(dep)
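
				# Map each entity ID to the indices of the tokens it overlaps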
				entityIDsToTokenLocs = defaultdict(list)
				for i, t in enumerate(tokens):
					entitiesOverlappingWithToken = denotationTree[t.startPos:t.endPos]
					for interval in entitiesOverlappingWithToken:
						entityID = interval.data
						entityIDsToTokenLocs[entityID].append(i)
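
				# Bundle the text, tokens and dependency edges into a kindred Sentence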
				sentence = kindred.Sentence(sentenceTxt, tokens, dependencies, d.sourceFilename)

				# Let's gather up the information about the "known" entities in the sentence
				for entityID, entityLocs in sorted(entityIDsToTokenLocs.items()):
					# Get the entity associated with this ID
					e = entityIDsToEntities[entityID]
					sentence.addEntityAnnotation(e, entityLocs)

				d.addSentence(sentence)

		corpus.parsed = True