If you need a somewhat complex dataset for testing plot features in Scala, read this page.
Author: Jim Pivarski
Rationale
Most of the examples must use some dataset, so I prepared a sample of CMS public data to be read with no dependencies. The data came from the CERN Open Data portal; it is 50 fb-1 of highly processed particle physics data from the CMS experiment, with 469,384 events selected by the 24 GeV/c isolated muon trigger.
For convenience, it has been converted to compressed JSON. The code that reads it into classes is provided below for you to copy-paste, rather than a JAR to load, because I want you to see how it’s done and you may want to modify it.
Loading the data quickly
The data-loading code is now included in the core Histogrammar library (as of 1.0.3), so you just do this:
import org.dianahep.histogrammar.tutorial.cmsdata
val events = cmsdata.EventIterator()
For older versions of Histogrammar or to see what this is doing, read below.
Pedestrian approach
Start a Scala prompt with the Histogrammar JAR loaded. That is, do
scala -cp histogrammar-0.7.jar
for an appropriate version of Histogrammar to get a scala> prompt. Then either
- copy-paste the code into your terminal (watching for possible error messages), or
- download the raw code and type
:load /path/to/scala-cmsdata.scalato load it.
If all goes well, you’ll have an iterator named events that pulls data from the web as needed. You get events by repeatedly calling events.next(). To restart the iterator from the beginning, do val events = EventIterator().
The code to copy-paste is below and the link to raw code is here.
// class definitions
trait LorentzVector extends Serializable {
// abstract members; must be defined by subclasses
def px: Double
def py: Double
def pz: Double
def E: Double
// methods common to all LorentzVectors
def pt = Math.sqrt(px*px + py*py)
def p = Math.sqrt(px*px + py*py + pz*pz)
def mass = Math.sqrt(E*E - px*px - py*py - pz*pz)
def eta = 0.5*Math.log((p + pz)/(p - pz))
def phi = Math.atan2(py, px)
// addition operator is a method named "+"
def +(two: LorentzVector) = {
val one = this
// create a subclass and an instance in one block
new LorentzVector {
def px = one.px + two.px
def py = one.py + two.py
def pz = one.pz + two.pz
def E = one.E + two.E
override def toString() = s"LorentzVector($px, $py, $pz, $E)"
}
}
}
// particle class definitions are now one-liners
case class Jet(px: Double, py: Double, pz: Double, E: Double, btag: Double) extends LorentzVector
case class Muon(px: Double, py: Double, pz: Double, E: Double, q: Int, iso: Double) extends LorentzVector
case class Electron(px: Double, py: Double, pz: Double, E: Double, q: Int, iso: Double) extends LorentzVector
case class Photon(px: Double, py: Double, pz: Double, E: Double, iso: Double) extends LorentzVector
case class MET(px: Double, py: Double) {
def pt = Math.sqrt(px*px + py*py)
}
case class Event(jets: Seq[Jet], muons: Seq[Muon], electrons: Seq[Electron], photons: Seq[Photon], met: MET, numPrimaryVertices: Long)
// event data iterator
case class EventIterator(location: String = "http://histogrammar.org/docs/data/triggerIsoMu24_50fb-1.json.gz") extends Iterator[Event] {
import org.dianahep.histogrammar.json._
// use Java libraries to stream and decompress data on-the-fly
@transient val scanner = new java.util.Scanner(
new java.util.zip.GZIPInputStream(
new java.net.URL(location).openStream))
// read one ahead so that hasNext can effectively "peek"
private def getNext() =
try {
Json.parse(scanner.nextLine) collect {
case event: JsonObject => eventFromJson(event)
}
}
catch {
case err: java.util.NoSuchElementException => None
}
private var theNext = getNext()
// iterator interface
def hasNext = !theNext.isEmpty
def next() = {
val out = theNext.get
theNext = getNext()
out
}
def jetFromJson(params: Map[String, JsonNumber]) =
new Jet(params("px").toDouble,
params("py").toDouble,
params("pz").toDouble,
params("E").toDouble,
params("btag").toDouble)
def muonFromJson(params: Map[String, JsonNumber]) =
new Muon(params("px").toDouble,
params("py").toDouble,
params("pz").toDouble,
params("E").toDouble,
params("q").toInt,
params("iso").toDouble)
def electronFromJson(params: Map[String, JsonNumber]) =
new Electron(params("px").toDouble,
params("py").toDouble,
params("pz").toDouble,
params("E").toDouble,
params("q").toInt,
params("iso").toDouble)
def photonFromJson(params: Map[String, JsonNumber]) =
new Photon(params("px").toDouble,
params("py").toDouble,
params("pz").toDouble,
params("E").toDouble,
params("iso").toDouble)
def metFromJson(params: Map[String, JsonNumber]): MET =
new MET(params("px").toDouble, params("py").toDouble)
def eventFromJson(params: JsonObject) = {
val JsonArray(jets @ _*) = params("jets")
val JsonArray(muons @ _*) = params("muons")
val JsonArray(electrons @ _*) = params("electrons")
val JsonArray(photons @ _*) = params("photons")
val met = params("MET").asInstanceOf[JsonObject]
val JsonInt(numPrimaryVertices) = params("numPrimaryVertices")
new Event(
jets collect {case j: JsonObject => jetFromJson(j.to[JsonNumber].toMap)},
muons collect {case j: JsonObject => muonFromJson(j.to[JsonNumber].toMap)},
electrons collect {case j: JsonObject => electronFromJson(j.to[JsonNumber].toMap)},
photons collect {case j: JsonObject => photonFromJson(j.to[JsonNumber].toMap)},
metFromJson(met.to[JsonNumber].toMap),
numPrimaryVertices)
}
}
val events = EventIterator()
