Exploring data with SPARQL¶
Background:¶
Here, we show how to query the LinkedData graph using SPARQL to retrieve information about key study design descriptors such as study group size and treatment groups.
import rdflib
from rdflib import Graph, RDF
from IPython.core.display import display, HTML
import os
import json
import csv
import uuid
from SPARQLWrapper import SPARQLWrapper, SPARQLWrapper2, JSON, JSONLD, CSV, TSV, N3, RDF, RDFXML, TURTLE
import pandas as pds
import itertools
import numpy as np
from plotnine import *
def queryResultToHTMLTable(queryResult):
    """Render a SPARQL query result as an HTML table in the notebook.

    The header row is built from ``queryResult.vars``; every row obtained by
    iterating ``queryResult`` becomes one table row. Cells that are ``None``
    or whose text is empty are shown as "N/A".

    Args:
        queryResult: object exposing ``vars`` (iterable of variable names)
            and iterable over rows, each row being an iterable of cells.

    Returns:
        None. The table is shown through ``display(HTML(...))``.
    """
    # Imported locally so this function does not depend on a file-level import.
    from html import escape

    parts = ['<table><tr style="color:white;background-color:#43BFC7;font-weight:bold">']

    # Header: one cell per query variable.
    for varName in queryResult.vars:
        parts.append('<td>' + escape(str(varName)) + '</td>')
    parts.append('</tr>')

    # Body: one table row per result row.
    for row in queryResult:
        parts.append('<tr>')
        for column in row:
            # Compare by value, not identity: the previous `is not ""` test
            # never matched, so missing values were printed as "None".
            text = "" if column is None else str(column)
            # Escape cell text so characters such as '<' or '&' in labels
            # cannot break the table markup.
            parts.append('<td>' + (escape(text) if text else "N/A") + '</td>')
        parts.append('</tr>')
    parts.append('</table>')

    display(HTML("".join(parts)))
def get_sparql_variables(results, sparql_wrapper="SPARQLWrapper2"):
    """Return the variable names carried by a SPARQL result object.

    Args:
        results: result object exposing a ``vars`` attribute.
        sparql_wrapper: name of the wrapper flavour, compared
            case-insensitively with "sparqlwrapper2".

    Returns:
        ``results.vars`` for either flavour.
    """
    is_wrapper2 = sparql_wrapper.lower() == "sparqlwrapper2"
    if is_wrapper2:
        return results.vars
    # NOTE: an earlier variant read results['head']['vars'] on this path;
    # the current behaviour reads the attribute here as well.
    return results.vars
def get_sparql_bindings(results, sparql_wrapper="SPARQLWrapper2"):
    """Return the result rows (bindings) of a SPARQL result.

    Args:
        results: result object; read via ``results.bindings`` for the
            SPARQLWrapper2 flavour, otherwise via
            ``results['results']['bindings']``.
        sparql_wrapper: name of the wrapper flavour, compared
            case-insensitively with "sparqlwrapper2".

    Returns:
        The bindings collection taken from ``results``.
    """
    if sparql_wrapper.lower() != "sparqlwrapper2":
        # Mapping-style result: bindings are nested under 'results'.
        return results['results']['bindings']
    return results.bindings
def get_sparql_binding_variable_value(binding, variable, sparql_wrapper="SPARQLWrapper2"):
    """Return the value bound to ``variable`` in one result row.

    Args:
        binding: one result row, indexable by variable name.
        variable: name of the variable to look up.
        sparql_wrapper: name of the wrapper flavour, compared
            case-insensitively with "sparqlwrapper2".

    Returns:
        ``binding[variable]`` for the SPARQLWrapper2 flavour, otherwise
        ``binding[variable]['value']``.
    """
    if sparql_wrapper.lower() == "sparqlwrapper2":
        return binding[variable]
    entry = binding[variable]
    return entry['value']
def make_sparql_dict_list(bindings, variables, sparql_wrapper="SPARQLWrapper2"):
    """Convert SPARQL result rows into a list of ``{variable: value}`` dicts.

    Args:
        bindings: iterable of result rows.
        variables: variable names to extract from every row.
        sparql_wrapper: wrapper flavour, forwarded to
            ``get_sparql_binding_variable_value``.

    Returns:
        One dict per row, keyed by the entries of ``variables``; a variable
        that is not present in a row maps to ``None``.
    """
    def value_or_none(row, name):
        # Variables missing from a row are reported as None.
        if name not in row:
            return None
        return get_sparql_binding_variable_value(row, name, sparql_wrapper)

    return [
        dict(zip(variables, [value_or_none(row, name) for name in variables]))
        for row in bindings
    ]
def make_sparql_df(results, sparql_wrapper="SPARQLWrapper2"):
    """Build a pandas DataFrame from a SPARQL result.

    Args:
        results: SPARQL result object understood by ``get_sparql_variables``
            and ``get_sparql_bindings``.
        sparql_wrapper: wrapper flavour, forwarded to the helper functions.

    Returns:
        A DataFrame with one column per query variable, in query order.
        If a ``sample_mean`` column is present it is converted to float.
    """
    variables = get_sparql_variables(results, sparql_wrapper)
    # Strips the two-character sequence backslash + 'n' from each name.
    cleaned_variables = [str(var.replace('\\n', '')) for var in variables]
    bindings = get_sparql_bindings(results, sparql_wrapper)

    # One dict per result row, keyed by the cleaned variable names.
    data_list = make_sparql_dict_list(bindings, cleaned_variables, sparql_wrapper)

    # Passing the columns explicitly keeps them present (and ordered) even
    # when the query returned no rows.
    df = pds.DataFrame(data_list, columns=cleaned_variables)

    # Only queries that select ?sample_mean have this column; casting
    # unconditionally raised KeyError for every other query.
    if "sample_mean" in df.columns:
        df["sample_mean"] = df["sample_mean"].astype("float")

    return df[cleaned_variables]  # columns in query order
*Credit to Bob DuCharme for the `queryResultToHTMLTable` function defined above [http://www.snee.com/bobdc.blog/2016/07/sparql-in-a-jupyter-aka-ipytho.html]
# Empty rdflib graph; it is populated by g.parse(...) and queried with g.query(...) below.
g = Graph()
Let’s read the RDF graph that was generated by the `rose-dtpkg2rdf.py` Python script and saved to disk as a Turtle file:
# Load the "full" rose-aroma file into the graph. The file has a .ttl extension
# and is parsed here with format="n3".
# The two commented-out calls point at alternative input files.
#g.parse("./../data/processed/rose-data-as-rdf/rose-aroma-ng-06-2018-subset.ttl", format="n3")
g.parse("./../data/processed/rose-data-as-rdf/rose-aroma-ng-06-2018-full.ttl", format="n3")
# g.parse("./../data/processed/denovo/rdf/rose-aroma-ng-06-2018-full.ttl", format="n3")
Now let’s ask for the independent variables and their levels using the following SPARQL query:
# Select the distinct (?Predictor, ?PredictorLevel) label pairs: for every
# resource typed stato:0000087, take its rdfs:label and the rdfs:label of each
# value linked to it through BFO_0000051 (declared below as `has_part:`).
# Note that `has_part:` is a prefix whose namespace is the complete property
# IRI, so it is used with an empty local name.
get_idv_and_levels = g.query("""
PREFIX stato: <http://purl.obolibrary.org/obo/STATO_>
prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX ncbitax: <http://purl.obolibrary.org/obo/NCBITaxon_>
prefix has_part: <http://purl.obolibrary.org/obo/BFO_0000051>
SELECT DISTINCT
?Predictor
?PredictorLevel
WHERE {
?var a stato:0000087 ;
rdfs:label ?Predictor ;
has_part: ?value .
?value rdfs:label ?PredictorLevel .
}
""")
We can display the results of that query using the `queryResultToHTMLTable` function declared earlier:
# Render the predictor / level pairs as an HTML table.
queryResultToHTMLTable(get_idv_and_levels)
Let’s now ask for the number of biological and technical replicates used to compute the mean concentration of each detected chemical compound; together, these compounds form the signature of the rose fragrance:
# For each population (typed stato:0000193) and each mean computed from it,
# report the population label, the compound the mean is about, the mean value,
# and two counts: distinct members of the population (?NbTechnicalReplicate)
# and distinct specified inputs of those members (?NbBiologicalReplicate).
#
# Every projected non-aggregate variable is listed in GROUP BY. Grouping by
# ?population alone is not valid SPARQL 1.1 when ?TreatmentGroup,
# ?ChemicalCompound and ?MeanConcentration are also selected: an engine either
# rejects the query or returns one arbitrary compound/mean per population.
# ?population stays in the grouping key so that two populations sharing a
# label are still counted separately.
get_replication_info = g.query("""
prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
prefix chmo: <http://purl.obolibrary.org/obo/CHMO_>
prefix msio: <http://purl.obolibrary.org/obo/MSIO_>
prefix stato: <http://purl.obolibrary.org/obo/STATO_>
prefix obi: <http://purl.obolibrary.org/obo/OBI_>
prefix ro: <http://purl.obolibrary.org/obo/RO_>
prefix po: <http://purl.obolibrary.org/obo/PO_>
prefix has_member: <http://purl.obolibrary.org/obo/RO_0002351>
prefix has_value: <http://purl.obolibrary.org/obo/STATO_0000129>
prefix computed_from: <http://purl.obolibrary.org/obo/STATO_0000557>
prefix has_specified_input: <http://purl.obolibrary.org/obo/OBI_0000293>
prefix is_specified_output_of: <http://purl.obolibrary.org/obo/OBI_0000295>
prefix is_about: <http://purl.obolibrary.org/obo/IAO_0000136>
SELECT
?TreatmentGroup
?ChemicalCompound
?MeanConcentration
(count(distinct ?member) as ?NbTechnicalReplicate)
(count(distinct ?input) as ?NbBiologicalReplicate)
WHERE {
?population a stato:0000193 ;
rdfs:label ?TreatmentGroup ;
has_member: ?member .
?member has_specified_input: ?input .
?mean a stato:0000402 ;
computed_from: ?population ;
has_value: ?MeanConcentration ;
is_about: ?ChemicalCompound .
?concentration a stato:0000072;
is_about: ?ChemicalCompound .
}
GROUP BY ?population ?TreatmentGroup ?ChemicalCompound ?MeanConcentration
""")
Once more, we invoke the pretty printing function
# Render the replicate counts as an HTML table.
queryResultToHTMLTable(get_replication_info)
# Gather one flat record per measurement: chemical name, CHEBI identifier,
# InChI, sample mean, the value bound to ?sem, treatment label, genotype and
# organism part.
# - ?chebi_identifier is used both as the target of `is_about:` on the mean
#   and as the rdf:type of ?chem, which is how the name and InChI are joined
#   to the mean.
# - SELECT REDUCED allows the engine to drop duplicate rows but does not
#   require it, so repeated rows may still appear.
# - The ?sample_mean variable is the column that make_sparql_df converts to
#   float.
get_all_data = g.query("""
prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
prefix chmo: <http://purl.obolibrary.org/obo/CHMO_>
prefix msio: <http://purl.obolibrary.org/obo/MSIO_>
prefix stato: <http://purl.obolibrary.org/obo/STATO_>
prefix obi: <http://purl.obolibrary.org/obo/OBI_>
prefix ro: <http://purl.obolibrary.org/obo/RO_>
prefix po: <http://purl.obolibrary.org/obo/PO_>
prefix chebi: <http://purl.obolibrary.org/obo/CHEBI_>
prefix has_value: <http://purl.obolibrary.org/obo/STATO_0000129>
prefix computed_from: <http://purl.obolibrary.org/obo/STATO_0000557>
prefix is_about: <http://purl.obolibrary.org/obo/IAO_0000136>
prefix is_denoted_by: <http://purl.obolibrary.org/obo/STATO_0000205>
prefix derives_from: <http://purl.obolibrary.org/obo/RO_0001000>
prefix located_in: <http://purl.obolibrary.org/obo/RO_0001025>
prefix denotes: <http://purl.obolibrary.org/obo/IAO_0000219>
prefix measured_in: <http://purl.obolibrary.org/obo/RO_0002351>
SELECT REDUCED ?chemical_name ?chebi_identifier ?inchi ?sample_mean ?sem ?treatment ?genotype ?organism_part
WHERE {
?pop_mean a stato:0000402 ;
is_about: ?chebi_identifier ;
computed_from: ?population ;
has_value: ?sample_mean .
?chem a ?chebi_identifier ;
rdfs:label ?chemical_name ;
is_denoted_by: ?inchi .
?semv a stato:0000037 ;
denotes: ?pop_mean ;
has_value: ?sem.
?population a stato:0000193 ;
rdfs:label ?treatment .
?sub_conc a stato:0000072 ;
derives_from: ?genotype ;
located_in: ?organism_part;
measured_in: ?population .
}
""")
# Show the full result as an HTML table, then load the same result into a
# DataFrame (`data`), which is the input of the plot below.
queryResultToHTMLTable(get_all_data)
data=make_sparql_df(get_all_data)
# Hex colours used for styling the plot.
gray = '#666666'
orange = '#FF8000'
blue = '#3333FF'

# Tick-label styles: x labels rotated by 90 degrees in blue, y labels in orange.
_x_tick_style = element_text(rotation=90, hjust=1, fontsize=6, color=blue)
_y_tick_style = element_text(rotation=0, hjust=2, fontsize=6, color=orange)

# Column chart of sample_mean per chemical_name, filled by treatment,
# with one facet per treatment laid out in a single column.
p1 = (
    ggplot(data)
    + aes('chemical_name', 'sample_mean', fill='factor(treatment)')
    + geom_col()
    + facet_wrap('~treatment', dir='v', ncol=1)
    + scale_y_continuous(expand=(0, 0))
    + theme(axis_text_x=_x_tick_style)
    + theme(axis_text_y=_y_tick_style)
    + theme(figure_size=(8, 16))
)

# The result of this expression is not assigned back to p1.
p1 + theme(panel_background=element_rect(fill=blue))

p1
References¶
References
Authors¶
Authors
Name |
ORCID |
Affiliation |
Type |
ELIXIR Node |
Contribution |
---|---|---|---|---|---|
University of Oxford |
Writing - Original Draft |
||||
University of Oxford |
Writing - Review & Editing, Funding Acquisition |