import sys
import itertools
import kindred
import pickle
import argparse
import codecs
import time
import re
import string
from collections import defaultdict,Counter
import json
import six
import os
def acronymMatch(words,pos,currentAcronym,atStart,subpos=None):
    """
    Recursively match an acronym, right-to-left, against the words that end at index pos.

    Each step tries to consume the last remaining letter of currentAcronym using the
    first letter of words[pos] (or of one hyphen-separated part of it) and then moves
    one word (or one hyphenated part) to the left.

    :param words: List of tokens of the sentence
    :param pos: Index of the word currently being matched against
    :param currentAcronym: The acronym letters still to be matched. Words are lowercased before comparison, so this is expected to be lowercase
    :param atStart: True only for the outermost call, where a trailing plural 's' on the acronym may be ignored
    :param subpos: Index of the hyphen-separated part of words[pos] being matched, or None when not inside a hyphenated word
    :return: Candidate start indices of the spelt-out form (may contain duplicates). Empty if the acronym cannot be matched
    :rtype: list
    """
    if len(currentAcronym) == 0:
        if subpos is not None: # Can't finish acronym mid-word
            return []
        return [pos+1]

    # Letters remain but there are no more words to the left. Without this guard
    # a negative pos would wrap around and match words at the END of the list.
    if pos < 0:
        return []

    curWord = words[pos].lower()
    if not curWord:
        # An empty token cannot supply a letter (and indexing it would fail)
        return []

    wordSplit = curWord.split('-')
    curLetter = currentAcronym[-1]

    moves = []
    if subpos is None:
        if atStart and curLetter == 's' and curWord[-1] == 's':
            # Possible plural: drop the 's' and retry against the same word
            moves.append( (words,pos,currentAcronym[:-1],False) )

        if curLetter == curWord[0]:
            moves.append( (words,pos-1,currentAcronym[:-1],False) )

        if curWord == '-':
            # A standalone hyphen token is skipped without consuming a letter
            moves.append( (words,pos-1,currentAcronym,False) )

    if len(wordSplit) > 1:
        # Hyphenated word: match its parts one at a time, starting from the last
        if subpos is None:
            subpos = len(wordSplit)-1
        if len(wordSplit[subpos]) > 0 and curLetter == wordSplit[subpos][0]:
            if subpos == 0:
                moves.append( (words,pos-1,currentAcronym[:-1],False) )
            else:
                moves.append( (words,pos,currentAcronym[:-1],False,subpos-1) )

    possibleStarts = []
    for move in moves:
        possibleStarts += acronymMatch(*move)

    return possibleStarts
def acronymDetection(words):
    """
    Find acronyms given as a single bracketed token that directly follows their spelt-out form.

    :param words: List of tokens of the sentence
    :return: List of (start,end,acronymLoc) tuples, where words[start:end] is the spelt-out form and acronymLoc is the index of the acronym token
    :rtype: list
    """
    acronyms = []

    # A candidate is exactly "(" TOKEN ")". Starting at index 1 skips a bracket
    # that opens the sentence, as there are no words before it to spell out.
    for i in range(1,len(words)-2):
        if words[i] != u'(' or words[i+2] != u')':
            continue

        acronymLoc = i+1
        possibleAcronym = words[acronymLoc]
        possibleStarts = acronymMatch(words,i-1,possibleAcronym.lower(),True)

        # A negative start can only come from indices wrapping around the list
        possibleStarts = [ start for start in possibleStarts if start >= 0 ]
        if len(possibleStarts) > 0:
            # Take the longest spelt-out form
            start = min(possibleStarts)
            end = i
            acronyms.append((start,end,acronymLoc))

    return acronyms
def mergeWordsForFusionDetection(words):
    """
    Glue together tokens that are joined by '-', '/' or ':'.

    A token is appended to the running chunk when it is one of the joining
    characters itself, or when the chunk so far ends with one.

    :param words: List of tokens
    :return: List of (start,end,text) tuples where start and end are the (inclusive) token indices covered by the glued text
    :rtype: list
    """
    joiners = ('-','/',':')

    merged = []
    current = ""
    spanStart = 0
    for index,word in enumerate(words):
        if word in joiners or current.endswith(joiners):
            current += word
            continue

        # This token starts a new chunk, so flush the one built so far
        if current:
            merged.append((spanStart,index-1,current))
        current = word
        spanStart = index

    if current:
        merged.append((spanStart,len(words)-1,current))

    return merged
def fusionGeneDetection(words, lookupDict):
    """
    Find terms made of several gene names joined by '-', '/' or ':' (e.g. BCR-ABL1).

    If every part shares a gene ID, the term is treated as one gene referred to by
    several of its names (e.g. HER2/neu). Otherwise, if every part is a gene, the
    term is marked as a combination of those genes.

    :param words: List of tokens of the sentence
    :param lookupDict: Dictionary of lowercased term to a collection of (entityType,externalID)
    :return: Tuple of (locs,terms,termtypesAndids) as three parallel lists
    :rtype: tuple
    """
    originalWords = list(words)
    loweredWords = [ word.lower() for word in words ]

    locs,terms,termtypesAndids = [],[],[]
    for start,end,merged in mergeWordsForFusionDetection(loweredWords):
        parts = re.split("[-/:]",merged)
        partCount = len(parts)
        if partCount == 1:
            continue

        everyPartIsGene = True
        geneIDs = []
        idCounts = Counter()
        for part in parts:
            if part not in lookupDict:
                everyPartIsGene = False
                break

            # Use the first gene entry found for this part
            for entityType,entityID in lookupDict[part]:
                if entityType == 'gene':
                    idCounts.update(entityID.split(';'))
                    geneIDs.append(entityID)
                    break
            else:
                # The part is a known term but not a gene
                everyPartIsGene = False
                break

        # IDs seen once for every part are shared by all of the "fusion" parts.
        # Hence this may not actually be a fusion, but just multiple names of a
        # single gene, e.g. HER2/neu
        sharedIDs = [ geneID for geneID,count in idCounts.items() if count == partCount ]
        if sharedIDs:
            annotation = [('gene',';'.join(sharedIDs))]
        elif everyPartIsGene:
            # All the parts look like (different) genes, so mark this as a fusion (or combo)
            comboIDs = [ geneID.replace(';','&') for geneID in geneIDs ]
            annotation = [('gene','combo|' + '|'.join(comboIDs))]
        else:
            continue

        termtypesAndids.append(annotation)
        terms.append(tuple(originalWords[start:end+1]))
        locs.append((start,end+1))

    return locs,terms,termtypesAndids
def getTermIDsAndLocations(sentence, lookupDict):
    """
    Find dictionary terms in a sentence by exact matching of lowercased text, longest match first.

    Windows of tokens are tried from the full sentence length down to a single
    token. The text of a matched window is blanked out so that shorter terms
    inside it are not matched again.

    :param sentence: Sentence to search. Must provide text and tokens (each with word, startPos and endPos)
    :param lookupDict: Dictionary of lowercased term to its (entityType,externalID) values
    :return: Tuple of (locs,terms,termtypesAndids) as three parallel lists, where each loc is a (startToken,endToken) pair with an exclusive end
    :rtype: tuple
    """
    termtypesAndids,terms,locs = [],[],[]

    tokens = sentence.tokens
    if not tokens:
        # Nothing to match, and no first token to anchor the offsets on
        return locs,terms,termtypesAndids

    # Lowercase all the tokens
    lowerWords = [ t.word.lower() for t in tokens ]

    blank = " " * len(sentence.text)

    # NOTE(review): lower() can change the length of a few Unicode strings, which
    # would misalign the offsets used below - confirm whether such text can occur
    tempSentence = sentence.text.lower()

    # Token positions are shifted so that the first token starts at offset 0 of the sentence text
    sentenceStart = tokens[0].startPos

    # The length of each search window will decrease from the full length
    # of the sentence down to 1
    for l in range(len(tokens),0,-1):
        # We move the search window through the text
        for i in range(len(tokens)-l+1):
            # Extract that window of text
            startPos = tokens[i].startPos - sentenceStart
            endPos = tokens[i+l-1].endPos - sentenceStart
            s = tempSentence[startPos:endPos]

            # Search for it in the dictionary
            if s in lookupDict:
                # If found, save the ID(s) in the dictionary
                termtypesAndids.append(lookupDict[s])
                terms.append(tuple(lowerWords[i:i+l]))
                locs.append((i,i+l))

                # And blank it out
                tempSentence = tempSentence[:startPos] + blank[startPos:endPos] + tempSentence[endPos:]

    # Then return the found term IDs
    return locs,terms,termtypesAndids
def startsWithButNotAll(s,search):
    """
    Check whether s begins with search and is also longer than search.

    :param s: String to test
    :param search: Prefix to look for
    :return: True if s starts with search and is longer than it
    :rtype: bool
    """
    if not s.startswith(search):
        return False
    return len(search) < len(s)
def cleanupVariant(variant):
    """
    Normalise a protein variant to uppercase with one-letter amino acid codes (e.g. p.Val600Glu becomes V600E).

    The text is uppercased, every 'P.' is removed and then each three-letter
    amino acid code is replaced, one code after another, by its one-letter code.

    :param variant: Variant text
    :return: Normalised variant text
    :rtype: str
    """
    threeToOne = (
        ('ALA','A'),('ARG','R'),('ASN','N'),('ASP','D'),('CYS','C'),
        ('GLU','E'),('GLN','Q'),('GLY','G'),('HIS','H'),('ILE','I'),
        ('LEU','L'),('LYS','K'),('MET','M'),('PHE','F'),('PRO','P'),
        ('SER','S'),('THR','T'),('TRP','W'),('TYR','Y'),('VAL','V'),
    )

    cleaned = variant.upper()
    cleaned = cleaned.replace('P.','')
    for threeLetter,oneLetter in threeToOne:
        cleaned = cleaned.replace(threeLetter,oneLetter)
    return cleaned
class EntityRecognizer:
    """
    Annotates entities in a Corpus using an exact-dictionary matching scheme with additional heuristics. These heuristics include detecting fusion gene mentions, microRNA, identifying acronyms to reduce ambiguity, identifying variants and more. All the options are parameters for the constructor of this class.

    :ivar lookup: Used for the dictionary matching. A dictionary of terms (lowercased strings) to a set of (entityType,externalID).
    :ivar detectFusionGenes: Whether it will try to identify fusion gene terms (e.g. BCR-ABL1). Lookup must contain terms of type 'gene'
    :ivar detectMicroRNA: Whether it will identify microRNA terms (added as 'gene' entities)
    :ivar acronymDetectionForAmbiguity: Whether it will try to identify acronyms and use this to deal with ambiguity (by removing incorrect matches to acronyms or the longer terms)
    :ivar mergeTerms: Whether it will merge neighbouring terms that refer to the same external entity (e.g. HER2/neu as one term instead of two)
    :ivar detectVariants: Whether it will identify a variant (e.g. V600E) and create an entity of type 'variant'
    :ivar variantStopwords: Variant terms to be ignored (e.g. S100P) if detectVariants is used. Stored as a set of lowercased strings
    :ivar detectPolymorphisms: Whether it will identify a SNP (using a dbSNP ID) and create an entity of type 'variant'
    :ivar removePathways: Whether it will remove genes that are actually naming a signalling pathway (e.g. MTOR pathway)
    """

    # Matches a token that is a dbSNP identifier (e.g. rs12345)
    _polymorphismRegex = re.compile(r'^rs[1-9][0-9]*$')

    def __init__(self,lookup,detectFusionGenes=False,detectMicroRNA=False,acronymDetectionForAmbiguity=False,mergeTerms=False,detectVariants=False,variantStopwords=None,detectPolymorphisms=False,removePathways=False):
        """
        Create an EntityRecognizer and provide the lookup table for terms and additional flags for what to identify in text

        :param lookup: A dictionary of terms (lowercased strings) to a set of (entityType,externalID).
        :param detectFusionGenes: Whether to try to identify fusion gene terms (e.g. BCR-ABL1). Lookup must contain terms of type 'gene'
        :param detectMicroRNA: Whether to identify microRNA terms (added as 'gene' entities)
        :param acronymDetectionForAmbiguity: Whether to try to identify acronyms and use this to deal with ambiguity (by removing incorrect matches to acronyms or the longer terms)
        :param mergeTerms: Whether to merge neighbouring terms that refer to the same external entity (e.g. HER2/neu as one term instead of two)
        :param detectVariants: Whether to identify a variant (e.g. V600E) and create an entity of type 'variant'
        :param variantStopwords: Variant terms to be ignored (e.g. S100P) if detectVariants is used
        :param detectPolymorphisms: Whether to identify a SNP (using a dbSNP ID) and create an entity of type 'variant'
        :param removePathways: Remove genes that are actually naming a signalling pathway (e.g. MTOR pathway)
        :type lookup: dict
        :type detectFusionGenes: bool
        :type detectMicroRNA: bool
        :type acronymDetectionForAmbiguity: bool
        :type mergeTerms: bool
        :type detectVariants: bool
        :type variantStopwords: list
        :type detectPolymorphisms: bool
        :type removePathways: bool
        """
        if variantStopwords is None:
            variantStopwords = []

        assert isinstance(lookup,dict)
        for termsmatch,typeAndIDs in lookup.items():
            assert isinstance(termsmatch,six.string_types), "Lookup key must be a string"
            assert isinstance(typeAndIDs,set), "Lookup value must be a set of (entityType,externalID)"
            assert len(typeAndIDs)>0, "Lookup value must be a set of (entityType,externalID)"
            for typeAndID in typeAndIDs:
                assert isinstance(typeAndID,tuple),"Lookup value must be a set of (entityType,externalID)"
                assert len(typeAndID)==2, "Lookup value must be a set of (entityType,externalID)"

        assert isinstance(detectFusionGenes,bool)
        assert isinstance(detectMicroRNA,bool)
        assert isinstance(acronymDetectionForAmbiguity,bool)
        assert isinstance(mergeTerms,bool)
        assert isinstance(detectVariants,bool)
        assert isinstance(detectPolymorphisms,bool)
        assert isinstance(removePathways,bool)

        assert isinstance(variantStopwords,list)
        for variantStopword in variantStopwords:
            assert isinstance(variantStopword,six.string_types), "variantStopwords should be a list of strings"

        self.lookup = lookup
        self.detectFusionGenes = detectFusionGenes
        self.detectMicroRNA = detectMicroRNA
        self.acronymDetectionForAmbiguity = acronymDetectionForAmbiguity
        self.mergeTerms = mergeTerms
        self.detectVariants = detectVariants
        # Stopwords are compared against the lowercased text of a variant match
        self.variantStopwords = set([vs.lower() for vs in variantStopwords])
        self.detectPolymorphisms = detectPolymorphisms
        self.removePathways = removePathways

        # Substitutions with one-letter amino acid codes (e.g. V600E)
        self.variantRegex1 = re.compile(r'\b[ACDEFGHIKLMNPQRSTVWY][1-9][0-9]*[ACDEFGHIKLMNPQRSTVWY]\b')
        # Substitutions with three-letter amino acid codes and optional p. prefix (e.g. p.Val600Glu)
        self.variantRegex2 = re.compile(r'\b(p\.)?((Ala)|(Arg)|(Asn)|(Asp)|(Cys)|(Glu)|(Gln)|(Gly)|(His)|(Ile)|(Leu)|(Lys)|(Met)|(Phe)|(Pro)|(Ser)|(Thr)|(Trp)|(Tyr)|(Val))[1-9][0-9]*((Ala)|(Arg)|(Asn)|(Asp)|(Cys)|(Glu)|(Gln)|(Gly)|(His)|(Ile)|(Leu)|(Lys)|(Met)|(Phe)|(Pro)|(Ser)|(Thr)|(Trp)|(Tyr)|(Val))\b', re.IGNORECASE)
        self.mirnaRegex = re.compile(r'(mir-|hsa-mir|microrna-|mir)(?P<id>\d+\w*(-\w+)*)', re.IGNORECASE)

    def _processWords(self, sentence):
        """
        Find all the terms in a sentence using the lookup and whichever heuristics are enabled.

        :param sentence: Parsed sentence (provides text and tokens with word, startPos and endPos)
        :return: Sorted list of (loc,terms,termtypesAndids), where loc is a (startToken,endToken) pair with an exclusive end
        :rtype: list
        """
        if not sentence.tokens:
            # No tokens means nothing to annotate (and no first token to anchor offsets on)
            return []

        locs,terms,termtypesAndids = getTermIDsAndLocations(sentence,self.lookup)

        words = [ t.word for t in sentence.tokens ]

        # Index the start and end locations of tokens for lookup. The regexes below
        # run on sentence.text, so token positions are shifted to start at the first
        # token, matching how getTermIDsAndLocations slices the sentence text.
        sentenceStart = sentence.tokens[0].startPos
        token_starts = { t.startPos-sentenceStart:i for i,t in enumerate(sentence.tokens) }
        token_ends = { t.endPos-sentenceStart:i for i,t in enumerate(sentence.tokens) }

        def addTerm(loc,term,typesAndIDs):
            # A location that already has a term is never annotated a second time
            if loc not in locs:
                termtypesAndids.append(typesAndIDs)
                terms.append(term)
                locs.append(loc)

        if self.detectVariants:
            snvMatches = list(self.variantRegex1.finditer(sentence.text)) + list(self.variantRegex2.finditer(sentence.text))
            for match in snvMatches:
                snvText = match.group()
                start,end = match.span()
                # Only accept matches that line up exactly with token boundaries
                if start in token_starts and end in token_ends and snvText.lower() not in self.variantStopwords:
                    cleaned = cleanupVariant(snvText)
                    addTerm((token_starts[start],token_ends[end]+1),(snvText,),[('variant',"substitution|%s"%cleaned)])

        if self.detectPolymorphisms:
            for i,w in enumerate(words):
                if self._polymorphismRegex.match(w):
                    addTerm((i,i+1),(w,),[('variant','dbsnp|%s'%w)])

        if self.detectMicroRNA:
            for match in self.mirnaRegex.finditer(sentence.text):
                mirText = match.group()
                start,end = match.span()
                if start in token_starts and end in token_ends:
                    cleaned = 'mir-' + match.group('id')
                    addTerm((token_starts[start],token_ends[end]+1),(mirText,),[('gene',"mirna|%s"%cleaned)])

        toRemove = set()
        if self.detectFusionGenes:
            fusionLocs,fusionTerms,fusionTermtypesAndids = fusionGeneDetection(words,self.lookup)
            for floc,fterm,ftermtypesAndid in zip(fusionLocs,fusionTerms,fusionTermtypesAndids):
                if floc not in locs:
                    # Check for which entities to remove that are inside this fusion term
                    fstart,fend = floc
                    for tstart,tend in locs:
                        if fstart <= tstart and tend <= fend:
                            toRemove.add((tstart,tend))

                    locs.append(floc)
                    terms.append(fterm)
                    termtypesAndids.append(ftermtypesAndid)

        filtered = sorted( (l,t,ti) for l,t,ti in zip(locs,terms,termtypesAndids) if l not in toRemove )

        if self.mergeTerms:
            filtered = self._mergeNeighbouringTerms(words,filtered)

        if self.acronymDetectionForAmbiguity:
            filtered = self._removeAcronymAmbiguity(words,locs,filtered)

        if self.removePathways:
            filtered = self._removePathwayGenes(words,filtered)

        return filtered

    @staticmethod
    def _mergeNeighbouringTerms(words,filtered):
        """
        Merge neighbouring terms that share an external ID (i.e. if a gene is referred to using two names together)

        Example: Hepatocellular carcinoma (HCC) or HER2/ Neu or INK4B P15

        :param words: List of tokens of the sentence
        :param filtered: Sorted list of (loc,terms,termtypesAndids)
        :return: Sorted list of (loc,terms,termtypesAndids) after merging
        :rtype: list
        """
        def explodeIDs(typesAndIDs):
            # Split up the ';'-delimited IDs into separate (type,id) pairs
            return set( (termType,termID) for termType,termIDs in typesAndIDs for termID in termIDs.split(';') )

        # First we'll go through and expand terms out into brackets
        expanded = []
        for (startA,endA),termsA,termTypesAndIDsA in filtered:
            termInBrackets = startA > 0 and endA < len(words) and words[startA-1] == '(' and words[endA] == ')'
            if termInBrackets:
                startA -= 1
                endA += 1
            expanded.append( ((startA,endA),termsA,termTypesAndIDsA) )

        # Expanding can change the sort order. Positions are sorted (rather than
        # the entries) so each expanded entry stays paired with its original term.
        order = sorted(range(len(expanded)),key=lambda k: expanded[k])
        originals = [ filtered[k] for k in order ]
        filteredWithBrackets = [ expanded[k] for k in order ]

        # Next we go through and create groups of terms that should be merged
        indexGroups,curGroup = [],None
        prevEnd,prevIDs = None,None
        for index,((curStart,curEnd),_,curTermTypesAndIDs) in enumerate(filteredWithBrackets):
            curIDs = explodeIDs(curTermTypesAndIDs)

            shouldMergeWithPrev = False
            if prevEnd is not None:
                # Terms touch directly or are separated by a single '/' or '-' token
                termsAreNeighbouring = (curStart == prevEnd or (curStart == (prevEnd+1) and words[prevEnd] in ['/','-']))
                if termsAreNeighbouring:
                    idsIntersection = prevIDs.intersection(curIDs)
                    if len(idsIntersection) > 0:
                        # Carry the shared IDs forward so a whole group has IDs in common
                        curIDs = idsIntersection
                        shouldMergeWithPrev = True

            prevEnd,prevIDs = curEnd,curIDs

            if shouldMergeWithPrev:
                curGroup.append(index)
            else:
                if curGroup:
                    indexGroups.append(curGroup)
                curGroup = [index]

        # Remember to add any final group to the list of groups
        if curGroup:
            indexGroups.append(curGroup)

        # And now we do merging where appropriate
        mergedFiltered = []
        for indexGroup in indexGroups:
            if len(indexGroup) == 1:
                # No merging required, so keep the term as it was before expanding brackets
                mergedFiltered.append(originals[indexGroup[0]])
                continue

            # Merging required
            idsIntersection = None
            for index in indexGroup:
                ids = explodeIDs(filteredWithBrackets[index][2])
                if idsIntersection is None:
                    idsIntersection = ids
                else:
                    idsIntersection = idsIntersection.intersection(ids)

            # Double check that the IDs of the merging terms do actually overlap
            assert len(idsIntersection) > 0

            groupedByType = defaultdict(list)
            for termType,termID in idsIntersection:
                groupedByType[termType].append(termID)
            newTermTypesAndIDs = [ (termType,";".join(sorted(termIDs))) for termType,termIDs in groupedByType.items() ]

            newStart = filteredWithBrackets[indexGroup[0]][0][0]
            newEnd = filteredWithBrackets[indexGroup[-1]][0][1]
            newTerms = tuple(words[newStart:newEnd])

            mergedFiltered.append( ((newStart,newEnd),newTerms,newTermTypesAndIDs) )

        return sorted(mergedFiltered)

    @staticmethod
    def _removeAcronymAmbiguity(words,locs,filtered):
        """
        Use obvious acronyms, e.g. "spelt out form (ACRONYM)", to remove ambiguous terms

        :param words: List of tokens of the sentence
        :param locs: Locations of all terms found in the sentence
        :param filtered: Sorted list of (loc,terms,termtypesAndids)
        :return: Sorted list of (loc,terms,termtypesAndids) without the removed terms
        :rtype: list
        """
        locsToRemove = set()
        for wordsStart,wordsEnd,acronymLoc in acronymDetection(words):
            acronymSpan = (acronymLoc,acronymLoc+1)
            wordIsTerm = (wordsStart,wordsEnd) in locs
            acronymIsTerm = acronymSpan in locs

            if wordIsTerm and acronymIsTerm:
                # Remove the acronym
                locsToRemove.add(acronymSpan)
            elif acronymIsTerm:
                # Remove any terms that lie within the spelt out words
                locsToRemove.update( (i,j) for i in range(wordsStart,wordsEnd) for j in range(i,wordsEnd+1) )

        return sorted( entry for entry in filtered if entry[0] not in locsToRemove )

    @staticmethod
    def _removePathwayGenes(words,filtered):
        """
        Drop 'gene' annotations from terms that are directly followed by a pathway word (e.g. MTOR pathway)

        :param words: List of tokens of the sentence
        :param filtered: List of (loc,terms,termtypesAndids)
        :return: List of (loc,terms,termtypesAndids), without terms that have no annotations left
        :rtype: list
        """
        forbiddenPathwayWords = set(['pathway','pathways','signaling','signalling','cascade'])

        kept = []
        for loc,terms,termtypesAndids in filtered:
            nextTokenIndex = loc[1]
            nextTokenIsForbiddenWord = nextTokenIndex < len(words) and words[nextTokenIndex].lower() in forbiddenPathwayWords
            if nextTokenIsForbiddenWord:
                termtypesAndids = [ (termtype,termid) for termtype,termid in termtypesAndids if not termtype == 'gene' ]

            if len(termtypesAndids) > 0:
                kept.append((loc,terms,termtypesAndids))

        return kept

    def annotate(self,corpus):
        """
        Annotate a parsed corpus with the wordlist lookup and other entity types

        :param corpus: Corpus to annotate
        :type corpus: kindred.Corpus
        """
        assert corpus.parsed == True, "Corpus must already be parsed before entity recognition"

        for doc in corpus.documents:
            # New entities are numbered on from those already in the document
            entityCount = len(doc.entities)
            for sentence in doc.sentences:
                extractedTermData = self._processWords(sentence)

                for locs,terms,termtypesAndids in extractedTermData:
                    startToken = locs[0]
                    endToken = locs[1]
                    # Token positions are used directly as offsets into the document text
                    startPos = sentence.tokens[startToken].startPos
                    endPos = sentence.tokens[endToken-1].endPos
                    text = doc.text[startPos:endPos]
                    loc = list(range(startToken,endToken))

                    # One entity is created for each (entityType,externalID) of the term
                    for entityType,externalID in termtypesAndids:
                        sourceEntityID = "T%d" % (entityCount+1)
                        e = kindred.Entity(entityType,text,[(startPos,endPos)],externalID=externalID,sourceEntityID=sourceEntityID)
                        doc.entities.append(e)
                        sentence.addEntityAnnotation(e,loc)
                        entityCount += 1

    @staticmethod
    def loadWordlists(entityTypesWithFilenames, idColumn=0, termsColumn=1, columnSeparator='\t', termSeparator='|'):
        """
        Load a wordlist from multiple files. By default, each file should be a tab-delimited file with the first column is the ID and the second column containing all the terms separated by '|'. This can be modified by the parameters.

        Terms are lowercased and stripped of surrounding whitespace. Empty terms are ignored. Within one file, the IDs of a term that appears under several IDs are joined with ';'.

        :param entityTypesWithFilenames: Dictionary of entityType => filename
        :param idColumn: The column containing the ID for the term (starts from 0)
        :param termsColumn: The column containing the list of terms (starts from 0)
        :param columnSeparator: The column separator for the file (default is a tab)
        :param termSeparator: The separator for the list of terms (default is a '|')
        :type entityTypesWithFilenames: dict
        :type idColumn: int
        :type termsColumn: int
        :type columnSeparator: str
        :type termSeparator: str
        :return: Dictionary of lookup values (term to a set of (entityType,externalID))
        :rtype: dict
        """
        errorMsg = 'entityTypesWithFilenames should be a dictionary with pairs of {entityType: filename} where both are strings and the filename points to a file that exists'

        assert isinstance(entityTypesWithFilenames,dict), errorMsg
        for entityType,filename in entityTypesWithFilenames.items():
            assert isinstance(entityType,six.string_types), errorMsg
            assert isinstance(filename,six.string_types), errorMsg
            assert os.path.isfile(filename), "%s does not exist" % filename

        assert isinstance(idColumn,int)
        assert isinstance(termsColumn,int)
        assert isinstance(columnSeparator,six.string_types)
        assert isinstance(termSeparator,six.string_types)

        requiredColumns = max(idColumn,termsColumn)+1

        lookup = defaultdict(set)
        for entityType,filename in entityTypesWithFilenames.items():
            with codecs.open(filename,'r','utf-8') as f:
                # Gather all the IDs of each term in this file first
                tempLookup = defaultdict(set)
                for lineno,line in enumerate(f):
                    split = line.strip().split(columnSeparator)
                    assert len(split) >= requiredColumns, 'Line %d contains only %d columns when %d are required' % (lineno+1,len(split),requiredColumns)
                    termid,terms = split[idColumn],split[termsColumn]
                    for term in terms.split(termSeparator):
                        term = term.lower().strip()
                        # An empty term (e.g. from "a||b") can never be a meaningful match
                        if term:
                            tempLookup[term].add(termid)

            for term,idlist in tempLookup.items():
                lookup[term].add( (entityType,";".join(sorted(idlist))) )

        return dict(lookup)