Exploring data with SPARQL¶

Recipe Overview

Reading Time

30 minutes

Executable Code

Yes

Difficulty

FAIRifying Data Matrices - Step3 - Exploring data with SPARQL

Recipe Type

Hands-on

Audience

Principal Investigator, Data Manager, Data Scientist

Maturity Level & Indicator

DSM-1-C0

DSM-1-C1

Cite me with FCB040

Background:¶

Here, we show how to query the LinkedData graph using SPARQL to retrieve information about key study design descriptors such as study group size and treatment groups.

import rdflib
from rdflib import Graph, RDF
from IPython.core.display import display, HTML
import os
import json
import csv
import uuid

from SPARQLWrapper import SPARQLWrapper, SPARQLWrapper2, JSON, JSONLD, CSV, TSV, N3, RDF, RDFXML, TURTLE
import pandas as pds
import itertools

import numpy as np
from plotnine import *

def queryResultToHTMLTable(queryResult):
    
   HTMLResult = '<table><tr style="color:white;background-color:#43BFC7;font-weight:bold">'
   # print variable names and build header:
   for varName in queryResult.vars:
       HTMLResult = HTMLResult + '<td>' + varName + '</td>'
   HTMLResult = HTMLResult + '</tr>'

   # print values from each row and build table of results
   for row in queryResult:
      HTMLResult = HTMLResult + '<tr>'   
      for column in row:
        #print("COLUMN:", column)
        if column is not "":
             HTMLResult = HTMLResult + '<td>' +  str(column) + '</td>'
        else:
             HTMLResult = HTMLResult + '<td>' + "N/A"+ '</td>'
      HTMLResult = HTMLResult + '</tr>'
   HTMLResult = HTMLResult + '</table>'
   display(HTML(HTMLResult))

def get_sparql_variables(results, sparql_wrapper="SPARQLWrapper2"):
#     return results.vars if ("sparqlwrapper2" == sparql_wrapper.lower()) else results['head']['vars']
    return results.vars if ("sparqlwrapper2" == sparql_wrapper.lower()) else results.vars


def get_sparql_bindings(results, sparql_wrapper="SPARQLWrapper2"):
    return results.bindings if ("sparqlwrapper2" == sparql_wrapper.lower()) else results['results']['bindings']


def get_sparql_binding_variable_value(binding, variable, sparql_wrapper="SPARQLWrapper2"):
    return binding[variable] if ("sparqlwrapper2" == sparql_wrapper.lower()) else binding[variable]['value']

def make_sparql_dict_list(bindings, variables, sparql_wrapper="SPARQLWrapper2"):
    def binding_value(binding, var): # helper function for returning values
        return get_sparql_binding_variable_value(binding, var, sparql_wrapper) if (var in binding) else None

    dict_list = []  # list to contain dictionaries
    for binding in itertools.chain(bindings):
        values = [binding_value(binding, var) for var in itertools.chain(variables)]
        dict_list.append(dict(zip(variables, values)))

    return dict_list


def make_sparql_df(results, sparql_wrapper="SPARQLWrapper2"):
    variables = get_sparql_variables(results, sparql_wrapper)
    # print(variables)

    cleaned_variables=[str(var.replace('\\n','')) for var in variables] 

    #print(cleaned_variables)
    bindings = get_sparql_bindings(results, sparql_wrapper)
    # print(bindings)
    
    # create a list of dictionaries to use as data for dataframe
    data_list = make_sparql_dict_list(bindings, cleaned_variables, sparql_wrapper)
    
    # print(data_list)

    df = pds.DataFrame(data_list) # create dataframe from data list
    df["sample_mean"] = df["sample_mean"].astype("float")

   # print(df["sample_mean"])
    return df[cleaned_variables] # return dataframe with columns reordered

*credits to Bob du Charme for the following function [http://www.snee.com/bobdc.blog/2016/07/sparql-in-a-jupyter-aka-ipytho.html]

g = Graph()

Let’s read the RDF graph generated using the rose-dtpkg2rdf.py python script and saved to disk as a turtle file

#g.parse("./../data/processed/rose-data-as-rdf/rose-aroma-ng-06-2018-subset.ttl", format="n3")
g.parse("./../data/processed/rose-data-as-rdf/rose-aroma-ng-06-2018-full.ttl", format="n3")
# g.parse("./../data/processed/denovo/rdf/rose-aroma-ng-06-2018-full.ttl", format="n3")

Now let’s ask for the independent variables and their levels using the following SPARQL query

get_idv_and_levels = g.query("""
PREFIX stato: <http://purl.obolibrary.org/obo/STATO_>
prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> 
PREFIX ncbitax: <http://purl.obolibrary.org/obo/NCBITaxon_>
prefix has_part: <http://purl.obolibrary.org/obo/BFO_0000051>
            SELECT DISTINCT
             ?Predictor
             ?PredictorLevel
             WHERE { 
                ?var a stato:0000087 ;
                    rdfs:label ?Predictor ;
                    has_part: ?value .
                ?value rdfs:label ?PredictorLevel .    
                 }               
""")

We can display the results of that query using the function declared earlier on

queryResultToHTMLTable(get_idv_and_levels)

Let’s now ask for the number of biological and technical replicates used to compute the mean concentration of the chemical compounds detected and forming the signature of the rose fragrance

get_replication_info = g.query("""
prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> 
prefix chmo:   <http://purl.obolibrary.org/obo/CHMO_> 
prefix msio:   <http://purl.obolibrary.org/obo/MSIO_> 
prefix stato: <http://purl.obolibrary.org/obo/STATO_> 
prefix obi: <http://purl.obolibrary.org/obo/OBI_> 
prefix ro: <http://purl.obolibrary.org/obo/RO_>
prefix po: <http://purl.obolibrary.org/obo/PO_>
prefix has_member: <http://purl.obolibrary.org/obo/RO_0002351>
prefix has_value: <http://purl.obolibrary.org/obo/STATO_0000129> 
prefix computed_from: <http://purl.obolibrary.org/obo/STATO_0000557>
prefix has_specified_input: <http://purl.obolibrary.org/obo/OBI_0000293>
prefix is_specified_output_of: <http://purl.obolibrary.org/obo/OBI_0000295>
prefix is_about: <http://purl.obolibrary.org/obo/IAO_0000136>

SELECT        
      ?TreatmentGroup 
      ?ChemicalCompound
      ?MeanConcentration
      (count(distinct ?member) as ?NbTechnicalReplicate) 
      (count(distinct ?input) as ?NbBiologicalReplicate)      
      WHERE {
            ?population a stato:0000193 ;
                rdfs:label ?TreatmentGroup ;
                has_member: ?member .      
            ?member has_specified_input: ?input .              
            ?mean a stato:0000402 ;
                computed_from: ?population ;
                has_value: ?MeanConcentration ;
                is_about: ?ChemicalCompound .
            ?concentration a stato:0000072;
                is_about: ?ChemicalCompound .
}
      GROUP BY ?population 
""")

Once more, we invoke the pretty printing function

queryResultToHTMLTable(get_replication_info)

get_all_data = g.query("""
prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> 
prefix chmo:   <http://purl.obolibrary.org/obo/CHMO_> 
prefix msio:   <http://purl.obolibrary.org/obo/MSIO_> 
prefix stato: <http://purl.obolibrary.org/obo/STATO_> 
prefix obi: <http://purl.obolibrary.org/obo/OBI_> 
prefix ro: <http://purl.obolibrary.org/obo/RO_>
prefix po: <http://purl.obolibrary.org/obo/PO_>
prefix chebi: <http://purl.obolibrary.org/obo/CHEBI_>
prefix has_value: <http://purl.obolibrary.org/obo/STATO_0000129>
prefix computed_from: <http://purl.obolibrary.org/obo/STATO_0000557>
prefix is_about: <http://purl.obolibrary.org/obo/IAO_0000136>
prefix is_denoted_by: <http://purl.obolibrary.org/obo/STATO_0000205>
prefix derives_from: <http://purl.obolibrary.org/obo/RO_0001000> 
prefix located_in: <http://purl.obolibrary.org/obo/RO_0001025>
prefix denotes: <http://purl.obolibrary.org/obo/IAO_0000219>
prefix measured_in: <http://purl.obolibrary.org/obo/RO_0002351> 

SELECT REDUCED  ?chemical_name ?chebi_identifier ?inchi ?sample_mean ?sem ?treatment ?genotype ?organism_part
WHERE {
        ?pop_mean a stato:0000402 ;
         is_about: ?chebi_identifier ;
         computed_from: ?population ;
         has_value: ?sample_mean .
     ?chem a ?chebi_identifier ;
         rdfs:label ?chemical_name ;
         is_denoted_by: ?inchi .
     ?semv a stato:0000037 ; 
         denotes: ?pop_mean ;
         has_value: ?sem.
     ?population a stato:0000193 ;
         rdfs:label ?treatment .
     ?sub_conc a stato:0000072 ;
         derives_from: ?genotype ;
         located_in: ?organism_part;
         measured_in: ?population .         
}        
""")

queryResultToHTMLTable(get_all_data)

data=make_sparql_df(get_all_data)

# width = figure_size[0]
# height = figure_size[0] * aspect_ratio
gray = '#666666'
orange = '#FF8000'
blue = '#3333FF'

p1 = (ggplot(data)
 + aes('chemical_name','sample_mean',fill='factor(treatment)')
 + geom_col()
+ facet_wrap('~treatment', dir='v',ncol=1)
 + scale_y_continuous(expand = (0,0))
 + theme(axis_text_x=element_text(rotation=90, hjust=1, fontsize=6, color=blue))
 + theme(axis_text_y=element_text(rotation=0, hjust=2, fontsize=6, color=orange))
         + theme(figure_size = (8, 16))
)

p1 + theme(panel_background=element_rect(fill=blue)
       )

p1