This exercise will get you started on using TMVA in pyroot. TMVA is the Toolkit for Multivariate Data Analysis with ROOT (more information at http://tmva.sourceforge.net/), and it is a powerful tool for performing multivariate analyses.
import rootnotes
import rootprint
import utils
import ROOT
Create an ntuple and fill it with random numbers.
# Book an ntuple with three branches: the two coordinates and a signal flag.
ntuple = ROOT.TNtuple("ntuple", "ntuple", "x:y:signal")

# Populate it with 10000 signal/background event pairs.
for _ in range(10000):
    # signal: 2D Gaussian centred on (1, 1) with unit width
    sig_x = ROOT.gRandom.Gaus(1, 1)
    sig_y = ROOT.gRandom.Gaus(1, 1)
    ntuple.Fill(sig_x, sig_y, 1)
    # background: 2D Gaussian centred on (-1, -1) with unit width
    bg_x = ROOT.gRandom.Gaus(-1, 1)
    bg_y = ROOT.gRandom.Gaus(-1, 1)
    ntuple.Fill(bg_x, bg_y, 0)
Plot the distribution of the generated events
# Canvas on which the generated samples are displayed.
samplesCanvas = rootnotes.canvas("samplesCanvas", (400, 400))

# A single-bin 2D frame, drawn only to fix the axis ranges.
histo = ROOT.TH2F("histo", "", 1, -5, 5, 1, -5, 5)
histo.Draw()

# Selections picking out the two event classes (reused later for TMVA).
sigCut = ROOT.TCut("signal > 0.5")
bgCut = ROOT.TCut("signal <= 0.5")

# Overlay the signal events (red) and background events (blue).
for cut, color in ((sigCut, ROOT.kRed), (bgCut, ROOT.kBlue)):
    ntuple.SetMarkerColor(color)
    ntuple.Draw("y:x", cut, "same")

samplesCanvas
10000
Create the TMVA factory, the basic component of TMVA for the training of multivariate methods.
# Load the TMVA library.
ROOT.TMVA.Tools.Instance()

# TMVA::Factory insists on a real output file (passing None does not
# work), so open a dedicated one. Take care not to overwrite an
# existing file you want to keep.
fout = ROOT.TFile("TMVAResults.root", "RECREATE")

# Factory options, joined into TMVA's colon-separated option string.
factory_options = ":".join([
    "!V",
    "!Silent",
    "!Color",
    "!DrawProgressBar",
    "Transformations=I;D;P;G,D",
    "AnalysisType=Classification",
])
factory = ROOT.TMVA.Factory("TMVAClassification", fout, factory_options)
Add the variables and trees to the factory.
# Declare the two input variables, both of float type.
factory.AddVariable("x", "F")
factory.AddVariable("y", "F")

# The same ntuple serves as both signal and background source;
# the cuts passed below separate the two classes.
factory.AddSignalTree(ntuple)
factory.AddBackgroundTree(ntuple)

# Splitting options: use all events, random train/test split.
split_options = ":".join([
    "nTrain_Signal=0",
    "nTrain_Background=0",
    "SplitMode=Random",
    "NormMode=NumEvents",
    "!V",
])
factory.PrepareTrainingAndTestTree(sigCut, bgCut, split_options)
Book the multivariate method to train. In this case we are training a BDT and we specify the settings:
# Book a boosted decision tree: 850 AdaBoost-weighted trees of depth 3,
# Gini-index splitting, no pruning.
bdt_options = ":".join([
    "!H",
    "!V",
    "NTrees=850",
    "nEventsMin=150",
    "MaxDepth=3",
    "BoostType=AdaBoost",
    "AdaBoostBeta=0.5",
    "SeparationType=GiniIndex",
    "nCuts=20",
    "PruneMethod=NoPruning",
])
methodBDT = factory.BookMethod(ROOT.TMVA.Types.kBDT, "BDT", bdt_options)
Everything is setup, now train, test and evaluate all methods (the BDT in this case).
# Run the full TMVA workflow for all booked methods (here only the BDT):
# fit on the training sample, apply the result to the test sample, and
# write the performance summaries into the open output file.
factory.TrainAllMethods()
factory.TestAllMethods()
factory.EvaluateAllMethods()
Plot the results of the training.
# Stand-alone TMVA reader to evaluate the trained BDT at arbitrary points.
reader = ROOT.TMVA.Reader()

import array
# The reader needs writable single-float buffers bound to each variable.
varx = array.array('f', [0])
reader.AddVariable("x", varx)
vary = array.array('f', [0])
reader.AddVariable("y", vary)

# Load the weights file written by the training step above.
reader.BookMVA("BDT", "weights/TMVAClassification_BDT.weights.xml")

# Map the classifier response over a finely binned 2D grid.
histo2 = ROOT.TH2F("histo2", "", 200, -5, 5, 200, -5, 5)
xaxis = histo2.GetXaxis()
yaxis = histo2.GetYaxis()
for ix in range(1, histo2.GetNbinsX() + 1):
    varx[0] = xaxis.GetBinCenter(ix)
    for iy in range(1, histo2.GetNbinsY() + 1):
        vary[0] = yaxis.GetBinCenter(iy)
        # Store the BDT response at this bin centre as the bin content.
        histo2.SetBinContent(ix, iy, reader.EvaluateMVA("BDT"))
c2 = rootnotes.canvas("test2", (800, 400))
c2.Divide(2, 1)

# Left pad: the classifier response map, with sigma contours overlaid.
c2.cd(1)
histo2.Draw("colz")

# Keep Python references to drawn objects so garbage collection does
# not delete them while they are still displayed on the canvas.
gcSaver = []

# Draw 1- and 2-sigma circles around each sample mean.
for (cx, cy), color in (
    ((1, 1), ROOT.kRed),     # signal mean
    ((-1, -1), ROOT.kBlue),  # background mean
):
    for radius in (1, 2):
        ring = ROOT.TEllipse(cx, cy, radius)
        ring.SetFillStyle(0)
        ring.SetLineColor(color)
        ring.SetLineWidth(2)
        ring.Draw()
        gcSaver.append(ring)

# Right pad: BDT output distributions from the TMVA test sample tree.
hSig = ROOT.TH1F("hSig", "hSig", 22, -1.1, 1.1)
hBg = ROOT.TH1F("hBg", "hBg", 22, -1.1, 1.1)
ROOT.TestTree.Draw("BDT>>hSig", "classID == 0", "goff")  # signal
ROOT.TestTree.Draw("BDT>>hBg", "classID == 1", "goff")   # background

hSig.SetLineColor(ROOT.kRed)
hSig.SetLineWidth(2)
hBg.SetLineColor(ROOT.kBlue)
hBg.SetLineWidth(2)

# Show both histograms together via a THStack.
hs = ROOT.THStack("hs", "")
hs.Add(hSig)
hs.Add(hBg)

c2.cd(2)
hs.Draw()
c2
Question 1: Inspect the results saved in the TMVAResults.root output file. Plot the correlation matrix for signal and background. Plot also the BDT output. Before loading the file we need to close it. The ROOT.TestTree will not be accessible anymore as it will be deleted from memory. To rerun the cell above you will need to run from the start.
%%rootprint
# Close the RECREATE-mode file so its contents are flushed to disk.
# NOTE: this also removes ROOT.TestTree from memory, so the plotting
# cell above can only be rerun after re-running the whole notebook.
fout.Close()
# Re-open the results file (read-only by default) and list its contents.
tmvaFile = ROOT.TFile("TMVAResults.root")
tmvaFile.ls()
Uncomment and run the following cell to see a suggestion.
# print open("TMVASuggestionQuestion1.txt").read()
Question 2: Train also a neural network (NN) and show the performance on this dataset.
The NN is defined as kMLP (multilayer perceptron) and can be specified as method = factory.BookMethod(ROOT.TMVA.Types.kMLP, "MLP", ...). Some reasonable default options are "H:!V:NeuronType=tanh:VarTransform=N:NCycles=600:HiddenLayers=N+5:TestRate=5:!UseRegulator" (to be converted to a proper format)
Plot the output of the NN and the ROC curve. Which of the two methods gives a better performance?
It is best to restart the kernel (Kernel -> restart) and run over from the first cell after modifying the code for the MLP training.
Uncomment and run the following cell to see a suggestion.
# print open("TMVASuggestionQuestion2.txt").read()