Source code for skosprovider_getty.utils

# -*- coding: utf-8 -*-

'''
This module contains utility functions for :mod:`skosprovider_getty`.
'''
import requests
import rdflib
from rdflib.graph import Graph
from rdflib.term import URIRef
from requests.packages.urllib3.exceptions import ConnectionError
from skosprovider.exceptions import ProviderUnavailableException

from skosprovider.skos import (
    Concept,
    Collection,
    Label,
    Note,
    ConceptScheme)

import logging
log = logging.getLogger(__name__)

from rdflib.namespace import RDFS, RDF, SKOS, DC

PROV = rdflib.Namespace('http://www.w3.org/ns/prov#')
ISO = rdflib.Namespace('http://purl.org/iso25964/skos-thes#')
GVP = rdflib.Namespace('http://vocab.getty.edu/ontology#')

def conceptscheme_from_uri(conceptscheme_uri, **kwargs):
    '''
    Read a SKOS Conceptscheme from a :term:`URI`

    The RDF document at ``<conceptscheme_uri>.rdf`` is fetched and every
    ``rdfs:label`` of the conceptscheme becomes an English ``prefLabel``.
    When the document does not exist (:func:`uri_to_graph` returned `False`)
    the conceptscheme is returned without labels. Notes are always empty.

    :param string conceptscheme_uri: URI of the conceptscheme.
    :param session: optional keyword argument, a :class:`requests.Session`
        to reuse for the HTTP request. When omitted a temporary session is
        created and closed again before returning.
    :rtype: skosprovider.skos.ConceptScheme
    '''
    # ensure it only ends in one slash (leading characters are left alone)
    conceptscheme_uri = conceptscheme_uri.rstrip('/') + '/'

    # Only build a session when the caller did not hand one in; a session
    # created here is owned by this call and must be released again.
    session = kwargs.get('session')
    owns_session = session is None
    if owns_session:
        session = requests.Session()
    try:
        graph = uri_to_graph('%s.rdf' % (conceptscheme_uri), session=session)
    finally:
        if owns_session:
            session.close()

    notes = []
    labels = []
    if graph is not False:
        scheme_ref = URIRef(conceptscheme_uri)
        for _subject, _predicate, literal in graph.triples(
                (scheme_ref, RDFS.label, None)):
            labels.append(Label(literal.toPython(), "prefLabel", 'en'))

    return ConceptScheme(
        conceptscheme_uri,
        labels=labels,
        notes=notes
    )
def things_from_graph(graph, subclasses, conceptscheme, **kwargs):
    '''
    Convert the concepts and collections described in a graph to
    skosprovider objects.

    :param rdflib.graph.Graph graph: graph holding the triples to convert.
    :param subclasses: object whose ``get_subclasses`` method returns the RDF
        classes that count as ``SKOS.Concept`` and as ``SKOS.Collection``.
    :param conceptscheme: conceptscheme attached to every created object; its
        ``uri`` is also used to look up the superordinates of collections.
    :param session: optional keyword argument, a :class:`requests.Session`
        to reuse for the superordinate lookups. When omitted a temporary
        session is created and closed again before returning.
    :returns: a list with the concepts first, followed by the collections.
    '''
    session = kwargs.get('session')
    owns_session = session is None
    if owns_session:
        session = requests.Session()
    try:
        clist = []
        concept_graph = Graph()
        collection_graph = Graph()
        for sc in subclasses.get_subclasses(SKOS.Concept):
            concept_graph += graph.triples((None, RDF.type, sc))
        for sc in subclasses.get_subclasses(SKOS.Collection):
            collection_graph += graph.triples((None, RDF.type, sc))

        # Same note-type ordering for every subject, so compute it once.
        note_types = hierarchy_notetypes(Note.valid_types)

        for sub, pred, obj in concept_graph.triples((None, RDF.type, None)):
            uri = str(sub)
            matches = {}
            for k in Concept.matchtypes:
                matches[k] = _create_from_subject_predicate(
                    graph, sub, URIRef(SKOS + k + 'Match'))
            con = Concept(
                uri_to_id(uri),
                uri=uri,
                concept_scheme=conceptscheme,
                labels=_create_from_subject_typelist(
                    graph, sub, Label.valid_types),
                notes=_create_from_subject_typelist(graph, sub, note_types),
                sources=[],
                broader=_create_from_subject_predicate(
                    graph, sub, SKOS.broader),
                narrower=_create_from_subject_predicate(
                    graph, sub, SKOS.narrower),
                related=_create_from_subject_predicate(
                    graph, sub, SKOS.related),
                subordinate_arrays=_create_from_subject_predicate(
                    graph, sub, ISO.subordinateArray),
                matches=matches
            )
            clist.append(con)

        for sub, pred, obj in collection_graph.triples((None, RDF.type, None)):
            uri = str(sub)
            col = Collection(
                uri_to_id(uri),
                uri=uri,
                concept_scheme=conceptscheme,
                labels=_create_from_subject_typelist(
                    graph, sub, Label.valid_types),
                notes=_create_from_subject_typelist(graph, sub, note_types),
                sources=[],
                members=_create_from_subject_predicate(
                    graph, sub, SKOS.member),
                superordinates=_get_super_ordinates(
                    conceptscheme, sub, session=session)
            )
            clist.append(col)

        return clist
    finally:
        if owns_session:
            session.close()


def _create_from_subject_typelist(graph, subject, typelist):
    '''
    Collect the values of ``subject`` for every SKOS term named in
    ``typelist`` (label types or note types).

    One ``note_uris`` list is shared over all types so a note that is
    reachable through several predicates is only created once, for the first
    type in ``typelist`` that reaches it.
    '''
    collected = []
    note_uris = []
    for p in typelist:
        term = SKOS.term(p)
        collected.extend(
            _create_from_subject_predicate(graph, subject, term, note_uris))
    return collected


def _get_super_ordinates(conceptscheme, sub, **kwargs):
    '''
    Ask the SPARQL endpoint next to the conceptscheme which subjects have
    ``sub`` as ``iso-thes:subordinateArray``.

    :param conceptscheme: object with a ``uri`` attribute; the endpoint is
        the parent of that URI followed by ``/sparql.json``.
    :param sub: URI of the collection.
    :param session: optional keyword argument, a :class:`requests.Session`.
        When omitted a temporary session is created and closed again.
    :returns: list with the ids of the matching subjects.
    :raises skosprovider.exceptions.ProviderUnavailableException: if the
        request can not be executed or the service answers with a 404.
    '''
    # NOTE(review): the query does not declare the iso-thes prefix itself;
    # presumably the endpoint predefines it - confirm.
    query = (
        "PREFIX ns:<%s> \n"
        "SELECT * WHERE {?s iso-thes:subordinateArray ns:%s}"
    ) % (conceptscheme.uri, uri_to_id(sub))
    request = conceptscheme.uri.strip('/').rsplit('/', 1)[0] + "/sparql.json"

    session = kwargs.get('session')
    owns_session = session is None
    if owns_session:
        session = requests.Session()
    try:
        try:
            res = session.get(request, params={"query": query})
        # requests wraps connection failures in requests.ConnectionError;
        # the ConnectionError imported at module level is kept as well so
        # anything that was caught before is still caught.
        except (requests.ConnectionError, ConnectionError):
            raise ProviderUnavailableException(
                "Request could not be executed - Request: %s - Params: %s"
                % (request, query))
    finally:
        if owns_session:
            session.close()

    if res.status_code == 404:
        raise ProviderUnavailableException(
            "Service not found (status_code 404) - Request: %s - Params: %s"
            % (request, query))
    if not res.encoding:
        res.encoding = 'utf-8'
    r = res.json()
    return [uri_to_id(result["s"]["value"])
            for result in r["results"]["bindings"]]


def _create_from_subject_predicate(graph, subject, predicate, note_uris=None):
    '''
    Convert every object of ``(subject, predicate, *)`` in ``graph``.

    The fragment of ``predicate`` (the part after ``#``) decides the result:
    a valid label type gives :class:`Label` objects, a valid note type gives
    :class:`Note` objects, anything else gives the id of the object URI.
    Falsy results are left out.

    :param list note_uris: URIs of notes that were already created. It is
        extended in place so callers can share it over several calls; a
        fresh list is used when it is not given.
    '''
    if note_uris is None:
        note_uris = []
    kind = predicate.split('#')[-1]
    results = []
    for s, p, o in graph.triples((subject, predicate, None)):
        if Label.is_valid_type(kind):
            value = _create_label(o, kind)
        elif Note.is_valid_type(kind):
            note_uri = o.toPython()
            if note_uri in note_uris:
                # already created through an earlier predicate
                continue
            note_uris.append(note_uri)
            value = _create_note(graph, o, kind, False)
        else:
            value = uri_to_id(o)
        if value:
            results.append(value)
    return results


def _create_label(literal, type):
    '''
    Create a :class:`Label` of the given type from an RDF literal.

    The language of the literal is used; ``'und'`` is used when the literal
    has no language or when creating the label with it raises a ValueError.
    '''
    language = literal.language
    if language is None:
        language = 'und'
    try:
        label = Label(literal.toPython(), type, language)
    except ValueError as e:
        log.warning(e)
        label = Label(literal.toPython(), type, 'und')
    return label


def _create_note(graph, uri, type, change_notes=False):
    '''
    Create a :class:`Note` of the given type from the note resource ``uri``.

    Returns `None` for resources with ``/rev/`` in their URI unless
    ``change_notes`` is true.
    '''
    if not change_notes and '/rev/' in uri:
        return None

    note = u''
    language = 'en'

    # http://vocab.getty.edu/aat/scopeNote
    for s, p, o in graph.triples((uri, RDF.value, None)):
        note += o.toPython()
        # taken as is, also when the literal carries no language tag
        language = o.language

    # for http://vocab.getty.edu/aat/rev/
    for s, p, o in graph.triples((uri, DC.type, None)):
        note += o.toPython()
    for s, p, o in graph.triples((uri, DC.description, None)):
        note += ': %s' % o.toPython()
    for s, p, o in graph.triples((uri, PROV.startedAtTime, None)):
        note += ' at %s ' % o.toPython()

    return Note(note, type, language)
class SubClassCollector:
    '''
    A utility class to collect all the subclasses of a certain Class from an
    ontology file.

    ``subclasses`` maps a class to the list of classes known to be that class
    or one of its subclasses. ``ontology_graphs`` caches the parsed ontology
    per namespace; a namespace that could not be loaded is cached as `None`
    so it is not fetched again.
    '''

    def __init__(self, namespace):
        self.ontology_graphs = {}
        self.namespace = namespace
        self.init_skos()

    def init_skos(self):
        '''
        Reset ``subclasses`` to the hard-coded lists for ``SKOS.Concept`` and
        ``SKOS.Collection``.
        '''
        self.subclasses = {
            SKOS.Concept: [
                SKOS.Concept,
                GVP.Concept,
                GVP.PhysPlaceConcept,
                GVP.PhysAdminPlaceConcept,
                GVP.AdminPlaceConcept,
                GVP.PersonConcept,
                GVP.UnknownPersonConcept,
                GVP.GroupConcept
            ],
            SKOS.Collection: [
                SKOS.Collection,
                SKOS.OrderedCollection,
                ISO.ThesaurusArray,
                GVP.Hierarchy,
                GVP.Facet,
                GVP.GuideTerm
            ],
        }

    def get_subclasses(self, clazz):
        '''
        Return the list of known subclasses of ``clazz``.

        :raises KeyError: if nothing is registered for ``clazz``.
        '''
        return self.subclasses[clazz]

    def collect_subclasses(self, clazz):
        '''
        Rebuild the subclass list of ``clazz`` from the ontology found at
        ``self.namespace`` and return it.

        The list starts as ``[clazz]``; when the ontology can not be loaded
        that is all that is returned.
        '''
        self.subclasses[clazz] = [clazz]
        g = self._load_ontology(self.namespace)
        if g is not None:
            for sub, pred, obj in g.triples((None, RDFS.subClassOf, None)):
                self._is_subclass_of(sub, clazz)
        return self.subclasses[clazz]

    def _load_ontology(self, namespace):
        '''
        Return the cached ontology graph for ``namespace``, parsing it as
        RDF/XML on first use. Returns `None` when loading failed.
        '''
        if namespace not in self.ontology_graphs:
            try:
                graph = rdflib.Graph()
                graph.parse(str(namespace), format="application/rdf+xml")
            except Exception as e:
                # Best effort: an ontology that can not be fetched or parsed
                # simply contributes no subclasses.
                log.debug("Could not load ontology %s: %s", namespace, e)
                graph = None
            self.ontology_graphs[namespace] = graph
        return self.ontology_graphs[namespace]

    def _is_subclass_of(self, subject, clazz, _visited=None):
        '''
        Tell whether ``subject`` is a (transitive) subclass of ``clazz``,
        registering it in ``self.subclasses[clazz]`` when it is.

        :param _visited: internal, the subjects already examined during this
            lookup. Guards against endless recursion on cyclic
            ``rdfs:subClassOf`` statements.
        '''
        known = self.subclasses[clazz]
        if subject in known:
            return True

        if _visited is None:
            _visited = set()
        if subject in _visited:
            return False
        _visited.add(subject)

        namespace = subject.split('#')[0] + "#"
        g = self._load_ontology(namespace)
        if g is None:
            return False

        for sub, pred, obj in g.triples((subject, RDFS.subClassOf, None)):
            if (obj in known or obj == clazz
                    or self._is_subclass_of(obj, clazz, _visited)):
                # Record the subject for indirect matches too, so the
                # outcome does not depend on the order of the triples.
                known.append(subject)
                return True
        return False
def hierarchy_notetypes(list):
    '''
    Return the note types with ``'note'`` moved to the end.

    A getty scopeNote will be of type skos.note and skos.scopeNote. To avoid
    doubles and to make sure the getty scopeNote will have type
    skos.scopeNote and not skos.note, skos.note is handled last.

    The argument itself is left untouched: a new list is returned. When
    ``'note'`` is not present the types are returned in their given order.

    :param list: the note types to order.
    :rtype: list
    '''
    ordered = [notetype for notetype in list]
    if 'note' in ordered:
        # only the first occurrence is moved
        ordered.remove('note')
        ordered.append('note')
    return ordered


def uri_to_id(uri):
    '''
    Return the last path segment of a URI, ignoring surrounding slashes.

    A value without any slash is returned as is.
    '''
    return uri.strip('/').rsplit('/', 1)[-1]
def uri_to_graph(uri, **kwargs):
    '''
    Fetch a URI and parse the response body into a graph.

    :param string uri: :term:`URI` where the RDF data can be found.
    :param session: optional keyword argument, a :class:`requests.Session`
        to reuse for the request. When omitted a temporary session is created
        and closed again before returning.
    :rtype: rdflib.Graph or `False` if the URI does not exist
    :raises skosprovider.exceptions.ProviderUnavailableException: if the
        getty.edu services are down
    '''
    # Only build a session when the caller did not hand one in; a session
    # created here is owned by this call and must be released again.
    session = kwargs.get('session')
    owns_session = session is None
    if owns_session:
        session = requests.Session()
    try:
        try:
            res = session.get(uri)
        except requests.ConnectionError:
            raise ProviderUnavailableException("URI not available: %s" % uri)
    finally:
        if owns_session:
            session.close()

    if res.status_code == 404:
        return False

    graph = rdflib.Graph()
    graph.parse(data=res.content)
    return graph