Tracking provenance in ODIS

Tracking provenance in ODIS#

Introduction#

Provenance tells us where something came from, and what happened to it on the way. Accurately tracking provenance has always been essential to building trust in any object (e.g. Is the antique I have at home genuine? Has the evidence used in a court case been handled in a secure chain of custody?). Provenance in digital (meta)data management is no different, and the more clearly and completely your (meta)data’s provenance is documented, the more third parties will be able to (re)use it and trust it.

Here, we provide some guidance on how to encode provenance information in JSON-LD/schema.org, for discovery across the ODIS Federation and its users.

Gleaner Prov#

The Gleaner application generates a prov graph of the activity of accessing and indexing provider resources. The main goal of this prov is to connect an indexed URL to the digital object stored in the object store. This digital object should be the JSON-LD data graph presented by the provider.

By contrast, the authoritative reference in the various profiles will connect the data graph ID (or, in the absence of that, the data graph URL or the URL of the resource referenced by Gleaner) to another reference. This may be an organization ID or a PID of the connected resource.

 1{
 2    "@context": {
 3        "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
 4        "prov": "http://www.w3.org/ns/prov#",
 5        "rdfs": "http://www.w3.org/2000/01/rdf-schema#"
 6    },
 7    "@graph": [
 8        {
 9            "@id": "https://www.re3data.org/repository/obis",
10            "@type": "prov:Organization",
11            "rdf:name": "Ocean Biodiversity Information System",
12            "rdfs:seeAlso": "https://obis.org"
13        },
14        {
15            "@id": "https://obis.org/dataset/9381239f-3d64-48b4-80c9-b9ebb674edc2",
16            "@type": "prov:Entity",
17            "prov:wasAttributedTo": {
18                "@id": "https://www.re3data.org/repository/obis"
19            },
20            "prov:value": "https://obis.org/dataset/9381239f-3d64-48b4-80c9-b9ebb674edc2"
21        },
22        {
23            "@id": "https://gleaner.io/id/collection/7c1eaa1aaed95861330109026c42e57a31ecae55",
24            "@type": "prov:Collection",
25            "prov:hadMember": {
26                "@id": "https://obis.org/dataset/9381239f-3d64-48b4-80c9-b9ebb674edc2"
27            }
28        },
29        {
30            "@id": "urn:gleaner:milled:obis:7c1eaa1aaed95861330109026c42e57a31ecae55",
31            "@type": "prov:Entity",
32            "prov:value": "7c1eaa1aaed95861330109026c42e57a31ecae55.jsonld"
33        },
34        {
35            "@id": "https://gleaner.io/id/run/7c1eaa1aaed95861330109026c42e57a31ecae55",
36            "@type": "prov:Activity",
37            "prov:endedAtTime": {
38                "@value": "2021-04-20",
39                "@type": "http://www.w3.org/2001/XMLSchema#dateTime"
40            },
41            "prov:generated": {
42                "@id": "urn:gleaner:milled:obis:7c1eaa1aaed95861330109026c42e57a31ecae55"
43            },
44            "prov:used": {
45                "@id": "https://gleaner.io/id/collection/7c1eaa1aaed95861330109026c42e57a31ecae55"
46            }
47        }
48    ]
49}

Hide code cell source

import json
from rdflib.extras.external_graph_libs import rdflib_to_networkx_multidigraph
from rdflib.extras.external_graph_libs import rdflib_to_networkx_graph
from pyld import jsonld
import graphviz
import os, sys

# Put the directory two levels above the current working directory on the
# import path so that the project-local ``lib`` package can be imported.
currentdir = os.path.dirname(os.path.abspath(''))
parentdir = os.path.dirname(currentdir)
sys.path.insert(0, parentdir)
from lib import jbutils

# Load the Gleaner provenance data graph. JSON interchange text is UTF-8, so
# the encoding is stated explicitly instead of relying on the locale default.
with open(
    "../../../odis-in/dataGraphs/indexing/prov/graphs/gleaner.json",
    encoding="utf-8",
) as dgraph:
    doc = json.load(dgraph)

# Shared term mappings: schema.org as the default vocabulary, plus PROV-O.
context = {
    "@vocab": "https://schema.org/",
    "prov": "http://www.w3.org/ns/prov#",
}

# Frame that selects the prov:Activity node and the three properties that
# describe the indexing run (what it generated, when it ended, what it used).
frame = {
    "@context": dict(context),
    # The framing flag is a boolean; the string "false" is not the JSON
    # boolean false, so use a real ``False`` here.
    "@explicit": False,
    "@type": "prov:Activity",
    "prov:generated": {},
    "prov:endedAtTime": {},
    "prov:used": {},
}

# Compact against the shared context first, then frame the compacted graph.
compacted = jsonld.compact(doc, context)

framed = jsonld.frame(compacted, frame)
jd = json.dumps(framed, indent=4)
print(jd)
{
    "@context": {
        "@vocab": "https://schema.org/",
        "prov": "http://www.w3.org/ns/prov#"
    },
    "@id": "https://gleaner.io/id/run/7c1eaa1aaed95861330109026c42e57a31ecae55",
    "@type": "prov:Activity",
    "prov:endedAtTime": {
        "@type": "http://www.w3.org/2001/XMLSchema#dateTime",
        "@value": "2021-04-20"
    },
    "prov:generated": {
        "@id": "urn:gleaner:milled:obis:7c1eaa1aaed95861330109026c42e57a31ecae55",
        "@type": "prov:Entity",
        "prov:value": "7c1eaa1aaed95861330109026c42e57a31ecae55.jsonld"
    },
    "prov:used": {
        "@id": "https://gleaner.io/id/collection/7c1eaa1aaed95861330109026c42e57a31ecae55",
        "@type": "prov:Collection",
        "prov:hadMember": {
            "@id": "https://obis.org/dataset/9381239f-3d64-48b4-80c9-b9ebb674edc2",
            "@type": "prov:Entity",
            "prov:value": "https://obis.org/dataset/9381239f-3d64-48b4-80c9-b9ebb674edc2",
            "prov:wasAttributedTo": {
                "@id": "https://www.re3data.org/repository/obis",
                "@type": "prov:Organization",
                "http://www.w3.org/1999/02/22-rdf-syntax-ns#name": "Ocean Biodiversity Information System",
                "http://www.w3.org/2000/01/rdf-schema#seeAlso": "https://obis.org"
            }
        }
    }
}

Nano Prov#

This is a basic nanoprov example. Note that this is a draft: the ID connections and examples have not been completed yet.

 1{
 2    "@context": {
 3        "gleaner": "https://voc.gleaner.io/id/",
 4        "np": "http://www.nanopub.org/nschema#",
 5        "prov": "http://www.w3.org/ns/prov#",
 6        "xsd": "http://www.w3.org/2001/XMLSchema#"
 7    },
 8    "@set": [
 9        {
10            "@id": "gleaner:nanopub/XID",
11            "@type": "np:NanoPublication",
12            "np:hasAssertion": {
13                "@id": "gleaner:nanopub/XID#assertion"
14            },
15            "np:hasProvenance": {
16                "@id": "gleaner:nanopub/XID#provenance"
17            },
18            "np:hasPublicationInfo": {
19                "@id": "gleaner:nanopub/XID#pubInfo"
20            }
21        },
22        {
23            "@id": "gleaner:nanopub/XID#assertion",
24            "@graph": {
25                "@id": "DataSetURI",
26                "@type": "schema:Dataset",
27                "description": "This is where you would put corrections or annotations",
28                "identifier": [
29                    {
30                        "@type": "schema:PropertyValue",
31                        "name": "GraphSHA",
32                        "description": "A SHA256 sha stamp on the harvested data graph from a URL",
33                        "value": "{{SHA256 HASH HERE}}"
34                    },
35                    {
36                        "@type": "schema:PropertyValue",
37                        "name": "ProviderID",
38                        "description": "The id provided with the data graph by the provider",
39                        "value": "{{re3 or URL noted in config}}"
40                    },
41                    {
42                        "@type": "schema:PropertyValue",
43                        "name": "URL",
44                        "description": "The URL harvested by gleaner",
45                        "value": "{{The URL the JSON-LD came from}}"
46                    }
47                ]
48            }
49        },
50        {
51            "@id": "gleaner:nanopub/XID#provenance",
52            "@graph": {
53                "@id": "URIforprovondataset",
54                "prov:wasGeneratedAtTime": {
55                    "@value": "dateDone",
56                    "@type": "xsd:dateTime"
57                },
58                "prov:wasDerivedFrom": {
59                    "@id": "IDHERE"
60                },
61                "prov:wasAttributedTo": {
62                    "@id": "IDHERE"
63                }
64            }
65        },
66        {
67            "@id": "gleaner:nanopub/XID#pubInfo",
68            "@graph": {
69                "@id": "IDHERE",
70                "prov:wasAttributedTo": {
71                    "@id": "gleaner:tool/gleaner"
72                },
73                "prov:generatedAtTime": {
74                    "@value": "2019-10-23T14:38:00Z",
75                    "@type": "xsd:dateTime"
76                }
77            }
78        }
79    ]
80}

Hide code cell source

import json
from pyld import jsonld
import os, sys

# Put the directory two levels above the current working directory on the
# import path so that the project-local ``lib`` package can be imported.
currentdir = os.path.dirname(os.path.abspath(''))
parentdir = os.path.dirname(currentdir)
sys.path.insert(0, parentdir)
from lib import jbutils

# Load the draft nanopublication provenance graph. JSON interchange text is
# UTF-8, so the encoding is stated explicitly instead of relying on the
# locale default.
with open(
    "../../../odis-in/dataGraphs/indexing/prov/graphs/nanoprov.json",
    encoding="utf-8",
) as dgraph:
    doc = json.load(dgraph)

# Compact against schema.org as the default vocabulary.
context = {
    "@vocab": "https://schema.org/",
}

compacted = jsonld.compact(doc, context)

# Hand the compacted graph to the project helper for display.
jbutils.show_graph(compacted)
../../_images/b06ceb92c218d6631622534fcf12fa2fc8c352e36c51d9c644dce71215284382.svg

Refs#

Nanopubs Guidance