# Source code for kindred.pubannotation

"""
Importer for PubAnnotation data
"""

import kindred
import requests
import re

def load(projectName):
	"""
	Download and load the corresponding corpus from the PubAnnotation resource

	The list of documents in the project is fetched first. The annotations of
	each document are then downloaded, parsed and added to a single corpus.

	:param projectName: The name of the PubAnnotation project to download
	:type projectName: str
	:return: The loaded corpus
	:rtype: kindred.Corpus
	:raises requests.exceptions.HTTPError: If a request is answered with an HTTP error status
	:raises RuntimeError: If a document URL or an annotations payload has an unexpected form
	"""
	projectURL = "http://pubannotation.org/projects/%s/docs.json" % projectName

	# Compiled once up front as the same pattern is applied to every document URL
	docURLPattern = re.compile(r"sourcedb/(?P<sourcedb>[^/]*)/sourceid/(?P<sourceid>[0-9]*)")

	loaded = kindred.Corpus()

	docsResponse = requests.get(projectURL)
	# Surface HTTP failures directly rather than as a JSON decoding error
	docsResponse.raise_for_status()

	for doc in docsResponse.json():
		m = docURLPattern.search(doc['url'])
		if m is None:
			raise RuntimeError("Unable to find sourcedb and sourceid in document URL: %s" % doc['url'])

		annotationsURL = "http://pubannotation.org/projects/%s/docs/sourcedb/%s/sourceid/%s/annotations.json" % (projectName,m.group('sourcedb'),m.group('sourceid'))

		annotationsResponse = requests.get(annotationsURL)
		annotationsResponse.raise_for_status()
		annotations = annotationsResponse.json()

		# The payload is accepted either as a single annotation object (dict)
		# or as a list of them, so normalise to a list before parsing
		if isinstance(annotations,dict):
			annotations = [annotations]
		elif not isinstance(annotations,list):
			raise RuntimeError("Unexpected annotations data (%s) received from %s" % (type(annotations).__name__,annotationsURL))

		for annotation in annotations:
			parsed = kindred.loadFunctions.parsePubAnnotationJSON(annotation)
			loaded.addDocument(parsed)

	return loaded