Practical Queries For Working With LSQ

For reference, here is a depiction of the LSQ data model (figure: Depiction of the LSQ2 Data Model).

# Get all SELECT queries from DBpedia log
# NOTE(review): only the triple-pattern body is present here — there is no query form
# (SELECT ... WHERE { }) and every IRI is an empty <> placeholder, presumably lost when
# the page was exported. Restore both before running this; TODO confirm the intended IRIs.
?s <> ?text .
?s <> ?spin .
?spin a <> . 
# Get all SELECT queries from Semantic Web Dog Food along with timestamps
# (the original execution time on the endpoint).
# NOTE(review): the prefix, graph and class IRIs are empty (<>) in this copy —
# restore the intended IRIs before running.
PREFIX lsqv: <>
PREFIX prov: <>
PREFIX sd: <>
SELECT DISTINCT ?text ?timeStamp
FROM <>
WHERE {
  # Query text and its remote (original endpoint) execution
  ?query lsqv:text ?text .
  ?query lsqv:hasRemoteExec ?re .
  ?re prov:atTime ?timeStamp .
  # Keep only queries whose SPIN representation has the required type
  ?query lsqv:hasSpin ?spin .
  ?spin a <> .
}
# Get all queries having resultset greater than zero.
# NOTE(review): only the pattern body is present — no query form (SELECT ... WHERE { })
# and every IRI is an empty <> placeholder, presumably lost in export. Restore both
# before running; TODO confirm the intended predicates.
?s <> ?text .
?s <> ?re . 
?s <> ?le . 
?le <> ?qe . 
?qe <> ?rs  . 
# Keep only rows whose ?rs value is greater than zero
FILTER(?rs > 0)
# Get all star shaped queries (There are four types of nodes in LSQ namely Star, Path, Hybrid, Sink).
# NOTE(review): only the pattern body is present — no query form (SELECT ... WHERE { })
# and every IRI is an empty <> placeholder, presumably lost in export. Restore both
# before running; TODO confirm the intended predicates and the node-type IRI.
?s <> ?text . 
?s <> ?sf . 
?sf <> ?bgp .
?bgp <> ?jv .
?jv <> <>. 
# Queries along with their resultset sizes and runtimes
# NOTE(review): every IRI in this copy is an empty <> placeholder — restore the
# intended predicates before running.
SELECT ?text ?rs ?sec
WHERE {
  ?s <> ?text .
  ?s <> ?le .
  ?s <> ?re .
  ?le <> ?qe .
  ?qe <> ?rs .
  ?qe <> ?sec .
}
# Queries with number of triple patterns
# NOTE(review): every IRI in this copy is an empty <> placeholder — restore the
# intended predicates before running.
SELECT ?text ?tp
WHERE {
  ?s <> ?text .
  ?s <> ?sf .
  ?sf <> ?tp .
}
# Queries and their number of join vertices.
# NOTE(review): every IRI in this copy is an empty <> placeholder — restore the
# intended predicates before running.
SELECT ?text ?jv
WHERE {
  ?s <> ?text .
  ?s <> ?sf .
  ?sf <> ?jv .
}
# Total queries having Filters
# NOTE(review): every IRI in this copy is an empty <> placeholder — restore the
# intended predicates and the feature IRI before running.
SELECT (COUNT(DISTINCT ?text) AS ?totalFilter)
WHERE {
  ?s <> ?text .
  ?s <> ?sf .
  ?sf <> <> .
}
# Total queries having Solution modifiers
# NOTE(review): every IRI in this copy is an empty <> placeholder — restore the
# intended predicates and the three feature IRIs in the FILTER before running.
SELECT (COUNT(DISTINCT ?text) AS ?totalMod)
WHERE {
  ?s <> ?text .
  ?s <> ?sf .
  ?sf <> ?uf .
  # Accept any one of three alternative feature IRIs
  FILTER(?uf = <> || ?uf = <> || ?uf = <>)
}
# Total queries having SPARQL functions
# NOTE(review): every IRI in this copy is an empty <> placeholder — restore the
# intended predicates and the feature IRI before running.
SELECT (COUNT(DISTINCT ?text) AS ?totalFunc)
WHERE {
  ?s <> ?text .
  ?s <> ?sf .
  ?sf <> <> .
}
# Count of property paths: number of usesFeature statements per property-path feature,
# most frequent first.
# NOTE(review): the lsqv: prefix IRI is empty (<>) in this copy — restore it before running.
PREFIX lsqv: <>
SELECT ?o (COUNT(*) AS ?ppathCount)
WHERE {
  ?s lsqv:usesFeature ?o .
  # Restrict to the property-path feature kinds
  FILTER(?o IN (
    lsqv:LinkPath, lsqv:ReverseLinkPath, lsqv:NegPropSetPath, lsqv:InversePath,
    lsqv:ModPath, lsqv:FixedLengthPath, lsqv:DistinctPath, lsqv:MultiPath,
    lsqv:ShortestPath, lsqv:ZeroOrOnePath
  ))
}
GROUP BY ?o
ORDER BY DESC(?ppathCount)
# Top objects ordered by query count
# NOTE(review): every IRI in this copy is an empty <> placeholder — restore the
# intended predicates before running.
# An aggregate with an alias must be parenthesised in the SELECT clause.
SELECT ?object (COUNT(DISTINCT ?query) AS ?queryCount)
WHERE {
  ?s <> ?query .
  ?s <> ?sf .
  ?sf <> ?tp .
  ?tp <> ?object .
}
GROUP BY ?object
ORDER BY DESC(?queryCount)
# Various features of SPARQL queries: join vertices, triple pattern count, result
# size, runtime and mean join vertex degree for queries using the Select feature.
# NOTE(review): the lsqv: prefix IRI is empty (<>) in this copy — restore it before running.
PREFIX lsqv: <>
SELECT DISTINCT ?qId ?joinVertices ?tps ?rs ?rt ?meanJoinVertexDegree
WHERE {
  # ?text, ?re and ?projVars are not projected; the patterns only require that they exist
  ?qId lsqv:text ?text .
  ?qId lsqv:hasRemoteExec ?re .
  ?qId lsqv:hasLocalExec ?le .
  ?qId lsqv:hasStructuralFeatures ?sf .
  ?sf  lsqv:projectVarCount ?projVars .
  ?sf  lsqv:joinVertexCount ?joinVertices .
  ?sf  lsqv:tpCount ?tps .
  ?sf  lsqv:joinVertexDegreeMean ?meanJoinVertexDegree .
  # Uses the declared lsqv: prefix, like the other feature IRIs on this page
  ?sf  lsqv:usesFeature lsqv:Select .
  ?le  lsqv:hasQueryExec ?qe .
  ?qe  lsqv:resultCount ?rs .
  ?qe  lsqv:evalDuration ?rt .
  FILTER (?rs > 0 && ?rs < 20000000 && ?tps > 0)
}
LIMIT 1000000
# Find queries with dbo:Actor as an object in a triple pattern
# NOTE(review): the prefix IRIs are empty (<>) in this copy — restore them before running.
PREFIX dbo: <>
PREFIX lsqv: <>
PREFIX sp: <>

SELECT DISTINCT ?text ?query
WHERE {
  ?query lsqv:text ?text .
  # Path: structural features -> BGP -> triple pattern in BGP -> triple pattern -> its object
  ?query lsqv:hasStructuralFeatures/lsqv:hasBgp/lsqv:hasTpInBgp/lsqv:hasTp/sp:object dbo:Actor .
}
# Find queries with the actor keyword
# NOTE(review): the prefix IRIs are empty (<>) in this copy — restore them before running.
# NOTE(review): bif: is not declared here; bif:contains is presumably Virtuoso's built-in
# full-text predicate, in which case this query only runs on a Virtuoso endpoint — confirm.
PREFIX dbo: <>
PREFIX lsqv: <>
PREFIX sp: <>

SELECT DISTINCT ?text ?query
WHERE {
  ?query lsqv:text ?text .
  ?text bif:contains "actor" .
}

The following result sets of SPARQL queries are based on the LSQ output of this query:

# NOTE(review): only the prefix declaration and the pattern body are present — the query
# form and enclosing braces appear to have been lost, and the swc: IRI is an empty <>.
# Restore both before running; TODO confirm the original projection.
PREFIX swc:  <>
  ?obj a swc:SessionEvent ;
  ?prop  ?target

Querying The Static Structure

# Labels of each BGP, its BGP nodes, the sub-BGP of each node and the triple
# patterns in that sub-BGP.
# NOTE(review): the prefix IRIs are empty (<>) in this copy — restore them before running.
PREFIX lsqv: <>
PREFIX rdfs: <>
PREFIX xsd: <>

SELECT ?bgpLabel ?bgpNodeLabel ?subBgpLabel ?subTpLabel {
  # Uses the declared lsqv: prefix; no lsq: prefix is declared in this query
  ?query lsqv:hasStructuralFeatures ?sf .

  # Structural hierarchy: BGP -> BGP node -> sub-BGP -> triple pattern in BGP -> triple pattern
  ?sf         lsqv:hasBgp     ?bgp .
  ?bgp        lsqv:hasBgpNode ?bgpNode .
  ?bgpNode    lsqv:hasSubBgp  ?subBgp .
  ?subBgp     lsqv:hasTpInBgp ?subTpInBgp .
  ?subTpInBgp lsqv:hasTp      ?subTp .

  ?bgp     rdfs:label ?bgpLabel .
  ?bgpNode rdfs:label ?bgpNodeLabel .
  ?subBgp  rdfs:label ?subBgpLabel .
  ?subTp   rdfs:label ?subTpLabel .

} ORDER BY ?bgpLabel ?bgpNodeLabel ?subBgpLabel ?subTpLabel

Accessing the RDF terms and variables of a query’s triple patterns (via the BGPs)

# Subject, predicate and object terms of the triple patterns of one query,
# reached through its BGPs.
# NOTE(review): the prefix IRIs are empty (<>) in this copy — restore them before running.
PREFIX lsqv: <>
PREFIX rdfs: <>
PREFIX sp: <>

SELECT ?tpLabel ?s ?p ?o {
  # Pick the structural features of a single (arbitrary) query
  { SELECT * { ?query lsqv:hasStructuralFeatures ?sf } LIMIT 1 }

  GRAPH ?g {
    ?sf         lsqv:hasBgp     ?bgp .
    ?bgp        lsqv:hasTpInBgp ?tpInBgp .
    ?tpInBgp    lsqv:hasTp      ?tp .

    # ?bgpLabel is not projected; the pattern still requires the BGP to have a label
    ?bgp rdfs:label ?bgpLabel .
    ?tp  rdfs:label ?tpLabel .

    ?tp sp:subject ?s .
    ?tp sp:predicate ?p .
    ?tp sp:object ?o .
  }
}
| bgpLabel                                                                                          | bgpNodeLabel | subBgpLabel                                                                                       | subTpLabel                                                                                                          |
| "?obj  a      <> ;\n      ?prop  ?target" | "?obj"       | "?obj  a      <> ;\n      ?prop  ?target" | "?obj <> <>" |
| "?obj  a      <> ;\n      ?prop  ?target" | "?obj"       | "?obj  a      <> ;\n      ?prop  ?target" | "?obj ?prop ?target"                                                                                                |
| "?obj  a      <> ;\n      ?prop  ?target" | "?prop"      | "?obj  ?prop  ?target"                                                                            | "?obj ?prop ?target"                                                                                                |
| "?obj  a      <> ;\n      ?prop  ?target" | "?target"    | "?obj  ?prop  ?target"                                                                            | "?obj ?prop ?target"                                                                                                |

Querying Over Executions

The following elements of SPARQL queries are evaluated in benchmark runs, which in essence yields statistical observations about those elements. The affected elements are: bgp, tpInBgp, tp and bgpNode.

The query is admittedly quite large, but the essence is that, from a given execution of a query (via lsqv:hasLocalExec), all corresponding executions of that query’s constituent elements are unambiguously accessible. Each execution is linked to the appropriate element via lsqv:hasExec in the reverse direction, i.e. the element points to its execution. Note that the rationale for the reverse link is that all corresponding executions are reachable from an element via forward links, which is aimed at providing nicer Linked Data views.

PREFIX lsqv: <>
PREFIX rdfs: <>
PREFIX xsd: <>

SELECT ?exp ?bgpLabel ?bgpNodeLabel ?subBgpLabel ?subTpLabel ?bgpSize ?subTpSize ?subTpToBgpRatio
WHERE {
  # Descend from the query's local execution to the executions of its parts;
  # the local execution also carries the benchmark run
  ?query       lsqv:hasLocalExec   ?localExec .
  ?localExec   lsqv:hasBgpExec     ?bgpExec ;
               lsqv:benchmarkRun   ?exp .
  ?bgpExec     lsqv:hasJoinVarExec ?bgpNodeExec .
  ?bgpNodeExec lsqv:hasSubBgpExec  ?subBgpExec .

  # Sub-BGP execution: its triple-pattern-in-BGP executions and its result count
  ?subBgpExec  lsqv:hasTpInBgpExec ?subTpInBgpExec ;
               lsqv:hasElementExec _:subBgpMeasurement .
  _:subBgpMeasurement lsqv:resultCount ?bgpSize .

  # Triple-pattern-in-BGP execution: its triple pattern execution and the tp/bgp ratio
  ?subTpInBgpExec lsqv:hasTpExec    ?subTpExec ;
                  lsqv:tpToBgpRatio ?subTpToBgpRatio .

  # Triple pattern execution: its result count
  ?subTpExec lsqv:hasElementExec _:subTpMeasurement .
  _:subTpMeasurement lsqv:resultCount ?subTpSize .

  # Each query element points to its execution via lsqv:hasExec and carries a label
  ?bgp     lsqv:hasExec ?bgpExec .
  ?bgp     rdfs:label   ?bgpLabel .
  ?bgpNode lsqv:hasExec ?bgpNodeExec .
  ?bgpNode rdfs:label   ?bgpNodeLabel .
  ?subBgp  lsqv:hasExec ?subBgpExec .
  ?subBgp  rdfs:label   ?subBgpLabel .
  ?subTp   lsqv:hasExec ?subTpExec .
  ?subTp   rdfs:label   ?subTpLabel .

  # Optional extras, disabled by default:
  #   ?query lsqv:hash ?queryHash .
  #   ?exp dct:identifier ?expId .
  # bgp/tp selectivities (may be absent if involved result sets exceeded benchmark limits):
  #   OPTIONAL { ?subTpInBgpExec lsqv:bgpRestrictedTpSel ?bgpRestrictedTpSel }
}
ORDER BY ?exp ?bgpLabel ?bgpNodeLabel ?subBgpLabel
| exp                                                            | bgpLabel                                                | bgpNodeLabel | subBgpLabel                                             | subTpLabel                | bgpSize | subTpSize | subTpToBgpRatio |
| | "?obj  a      swc:SessionEvent ;\n      ?prop  ?target" | "?obj"       | "?obj  a      swc:SessionEvent ;\n      ?prop  ?target" | "?obj a swc:SessionEvent" | 0       | 0         | 0               |
| | "?obj  a      swc:SessionEvent ;\n      ?prop  ?target" | "?obj"       | "?obj  a      swc:SessionEvent ;\n      ?prop  ?target" | "?obj ?prop ?target"      | 0       | 900       | 0               |
| | "?obj  a      swc:SessionEvent ;\n      ?prop  ?target" | "?prop"      | "?obj  ?prop  ?target"                                  | "?obj ?prop ?target"      | 900     | 900       | 1               |
| | "?obj  a      swc:SessionEvent ;\n      ?prop  ?target" | "?target"    | "?obj  ?prop  ?target"                                  | "?obj ?prop ?target"      | 900     | 900       | 1               |

Find Sparse Join Candidates among BGPs

# Find sparse join candidates: search for basic graph patterns (BGPs) that have
# significantly fewer results than the smallest result set among their triple patterns.
# NOTE(review): the prefix IRIs are empty (<>) in this copy — restore them before running.
PREFIX lsqv: <>
PREFIX rdfs: <>
PREFIX xsd: <>

SELECT ?exp ?text ?bgpLabel ?tpLabel ?bgpSize ?tpSize ?bgpTpSizeRatio {
  { SELECT * { # Remove this "{ SELECT * {" line and the matching "} LIMIT 1000 }" line to run this query on all data

    ?query     lsqv:hasLocalExec ?localExec .
    ?localExec lsqv:hasBgpExec   ?bgpExec .

    # Links from the executions to the query's elements
    ?bgp     lsqv:hasExec ?bgpExec ; rdfs:label ?bgpLabel .
    ?bgpExec lsqv:hasElementExec [ lsqv:resultCount ?bgpSize ] .

    # Discard bgps with empty results
    FILTER(?bgpSize > 0)

    # Smallest triple pattern result size per bgpExec. Sub-selects are evaluated
    # independently of the outer pattern, so "ORDER BY ... LIMIT 1" would return a
    # single row for the whole dataset; MIN with GROUP BY gives one row per bgpExec.
    {
      SELECT ?bgpExec (MIN(?candidateSize) AS ?tpSize) {
        ?bgpExec              lsqv:hasTpInBgpExec ?candidateTpInBgpExec .
        ?candidateTpInBgpExec lsqv:hasTpExec      ?candidateTpExec .
        ?candidateTpExec      lsqv:hasElementExec [ lsqv:resultCount ?candidateSize ] .
      } GROUP BY ?bgpExec
    }

    # Recover the triple pattern execution(s) that attain that minimum
    # (ties produce one row per tied triple pattern)
    ?bgpExec     lsqv:hasTpInBgpExec ?tpInBgpExec .
    ?tpInBgpExec lsqv:hasTpExec      ?tpExec .
    ?tpExec      lsqv:hasElementExec [ lsqv:resultCount ?tpSize ] .

    # Compute the ratio of the bgp size vs smallest tp size
    BIND(?bgpSize / ?tpSize AS ?bgpTpSizeRatio)
    ?tp        lsqv:hasExec      ?tpExec ; rdfs:label ?tpLabel .
    ?localExec lsqv:benchmarkRun ?exp .
    ?query     lsqv:text         ?text
  } LIMIT 1000 }
}
ORDER BY ASC(?bgpTpSizeRatio)
| exp                                                                | bgpLabel                                                                                                                                                                                                                                                                                                                                                                                                                       | tpLabel                                                                                          | bgpSize                                        | tpSize                                           | bgpTpSizeRatio             |
| <> | "?id  a                     <> ;\n     <>  ?ft ;\n     <>  ?imdb_id"                                                                                                                                                                                                                                                        | "?id <> ?imdb_id"                                              | "23"^^<>  | "70876"^^<> | 0.000324510412551498391557 |
| <> | "?artist  a                     <> ;\n         <>  ?name ;\n         <>  <> ;\n         <>  <> ;\n         <>  <>" | "?artist <> <>"            | "1"^^<>   | "356"^^<>   | 0.002808988764044943820225 |
| <> | "?author  a                     <> .\n?film    <>  ?author .\n?actor   <>  ?film .\n?author  <>  ?nytId"                                                                                                                                                                           | "?author <> <>" | "116"^^<> | "30649"^^<> | 0.003784789063264706841985 |