This IPython notebook is an introduction to the LSST DM stack, with a focus on the pipe_base and pipe_tasks modules, which are used to make pipeline tasks that run as programs on the command line.
Here I will try to show how they work by building up, step by step, an example program based on what we did before.
Let's start with our code from before but turn it into a function, compressing it down to essentials.
import lsst.afw.math as math
import lsst.afw.table as afwTable
import lsst.afw.image as afwImg
import lsst.meas.algorithms as measAlg
def processTest():
"""This function reads sensor data from PhoSim and processes it."""
exposure = afwImg.ExposureF('lsst_e_99999999_f2_R22_S11_E000.fits.gz')
maskedImage = exposure.getMaskedImage()
image = maskedImage.getImage()
mask = maskedImage.getMask()
variance = maskedImage.getVariance()
statFlags = math.NPOINT | math.MEAN | math.STDEV | math.MAX | math.MIN | math.ERRORS
control = math.StatisticsControl()
imageStatistics = math.makeStatistics(maskedImage, statFlags, control)
numBins = imageStatistics.getResult(math.NPOINT)[0]
mean = imageStatistics.getResult(math.MEAN)[0]
print "The image has dimensions %i x %i pixels" %(maskedImage.getWidth(), maskedImage.getHeight())
print "Number of analyzed bins in image is %i" %numBins
print "Max = %9d" %imageStatistics.getResult(math.MAX)[0]
print "Min = %9d" %imageStatistics.getResult(math.MIN)[0]
print "Mean = %9.8f +- %3.1f" %imageStatistics.getResult(math.MEAN)
print "StdDev = %9.2f" %imageStatistics.getResult(math.STDEV)[0]
# Configure the detection and measurement algorithms
schema = afwTable.SourceTable.makeMinimalSchema()
detectSourcesConfig = measAlg.SourceDetectionConfig(thresholdType='value')
measureSourcesConfig = measAlg.SourceMeasurementConfig()
# Setup the detection and measurement tasks
detect = measAlg.SourceDetectionTask (config=detectSourcesConfig, schema=schema)
measure = measAlg.SourceMeasurementTask(config=measureSourcesConfig, schema=schema)
# Detect the sources,then put them into a catalog (the table is where the catalog atually stores stuff)
table = afwTable.SourceTable.make(schema)
catalog = detect.makeSourceCatalog(table, exposure, sigma=5)
# Get the sources out of the catalog and apply Measurement routines
sources = catalog.sources
measure.run(exposure, sources)
# Now let's look at the output from some of the measurment algorithms.
fields = ['centroid.sdss', 'shape.sdss','flux.gaussian']
keys = [schema.find(f).key for f in fields]
print "\nSources Found:"
for source in sources:
print "\tSource ", source.get('id')
for f,k in zip(fields, keys):
print "\t",f, source.get(k)
Now run the function
# Call the function defined above; it prints the image statistics and the measured sources.
processTest()
The image has dimensions 4000 x 4072 pixels Number of analyzed bins in image is 16288000 Max = 1103 Min = 0 Mean = 0.00156667 +- 0.0 StdDev = 0.87 Sources Found: Source 1 centroid.sdss (1999.5, 2032.5) shape.sdss (ixx=2.88751926897, iyy=3.05814257946, ixy=-0.133327694904) flux.gaussian 21410.8483803
Now we are going to put this code in a class. This will allow us to split the various pieces up into member functions. There is no difference here compared to what I did before. It is just organized differently.
import lsst.afw.math as math
import lsst.afw.table as afwTable
import lsst.afw.image as afwImg
import lsst.meas.algorithms as measAlg
class ProcessTest():
"""This class reads sensor data from PhoSim and processes it."""
def readFile(self):
"""Read the file and make the exposure etc"""
self.exposure = afwImg.ExposureF('lsst_e_99999999_f2_R22_S11_E000.fits.gz')
self.maskedImage = self.exposure.getMaskedImage()
def doStats(self):
"""Do pixel based processing"""
statFlags = math.NPOINT | math.MEAN | math.STDEV | math.MAX | math.MIN | math.ERRORS
control = math.StatisticsControl()
imageStatistics = math.makeStatistics(self.maskedImage, statFlags, control)
numBins = imageStatistics.getResult(math.NPOINT)[0]
mean = imageStatistics.getResult(math.MEAN)[0]
print "The image has dimensions %i x %i pixels" %(self.maskedImage.getWidth(),
self.maskedImage.getHeight())
print "Number of analyzed bins in image is %i" %numBins
print "Max = %9d" %imageStatistics.getResult(math.MAX)[0]
print "Min = %9d" %imageStatistics.getResult(math.MIN)[0]
print "Mean = %9.8f +- %3.1f" %imageStatistics.getResult(math.MEAN)
print "StdDev = %9.2f" %imageStatistics.getResult(math.STDEV)[0]
def detectAndMeasureSources(self):
"""Find and measure the sources"""
# Configure the detection and measurement algorithms
schema = afwTable.SourceTable.makeMinimalSchema()
detectSourcesConfig = measAlg.SourceDetectionConfig(thresholdType='value')
measureSourcesConfig = measAlg.SourceMeasurementConfig()
# Setup the detection and measurement tasks
detect = measAlg.SourceDetectionTask (config=detectSourcesConfig, schema=schema)
measure = measAlg.SourceMeasurementTask(config=measureSourcesConfig, schema=schema)
# Detect the sources,then put them into a catalog (the table is where the catalog atually stores stuff)
table = afwTable.SourceTable.make(schema)
catalog = detect.makeSourceCatalog(table, self.exposure, sigma=5)
# Get the sources out of the catalog and apply Measurement routines
sources = catalog.sources
measure.run(self.exposure, sources)
# Now let's look at the output from some of the measurment algorithms.
fields = ['centroid.sdss', 'shape.sdss','flux.gaussian']
keys = [schema.find(f).key for f in fields]
print "\nSources Found:"
for source in sources:
print "\tSource ", source.get('id')
for f,k in zip(fields, keys):
print "\t",f, source.get(k)
def run(self):
self.readFile()
self.doStats()
self.detectAndMeasureSources()
Now run the code. Note carefully how this is done. First we have to make an instance of the ProcessTest class; that is why there is a "()" after the class name. Then we call the run member function on that instance.
# Instantiate ProcessTest, then call run() on the new instance.
ProcessTest().run()
The image has dimensions 4000 x 4072 pixels Number of analyzed bins in image is 16288000 Max = 1103 Min = 0 Mean = 0.00156667 +- 0.0 StdDev = 0.87 Sources Found: Source 1 centroid.sdss (1999.5, 2032.5) shape.sdss (ixx=2.88751926897, iyy=3.05814257946, ixy=-0.133327694904) flux.gaussian 21410.8483803
So, we get the same answer as before.
Now we want to do the same thing but we are going to inherit the class we make from a DM class that knows how to get data, read command line arguments, configure things etc. We are going to start with the most basic class for this: CmdLineTask. To make it easier to understand, let's start by just making a skeleton of what we want to do.
import lsst.pex.config as pexConfig
import lsst.pipe.base as pipeBase
class ProcessTestConfig(pexConfig.Config):
    """Config for ProcessTest.

    Declares no fields of its own; it exists so that the task has a
    ConfigClass to point at.
    """
class ProcessTestTask(pipeBase.CmdLineTask):
ConfigClass = ProcessTestConfig
_DefaultName = "processTest"
def _getConfigName(self):
return None
def _getMetadataName(self):
return None
def run(self, sensorRef):
print "HELLO!"
self.log.info("Processing %s" % (sensorRef.dataId))
# Passing args explicitly stands in for the command line: first the directory
# holding the data, then --id to select the visit.
output = ProcessTestTask.parseAndRun(args=['/Users/walter/LSST/iPython/testDir/imSim/PT1.2','--id','visit=99999999'])
HELLO!
There are several things to note here. First of all you see I had to make a ProcessTestConfig class. It inherits from pexConfig.Config. Then I also had to make a task (which I called ProcessTestTask) which inherits from pipeBase.CmdLineTask.
Those two classes give me the basics I need to read data, configure algorithms etc.
Next, you notice I had to:
Tell ProcessTestTask which class to use to configure itself and give it a name.
Add a few functions which return None. This is due to a present inability of DM to deal with standalone Command line tasks which aren't in the core code. You can read this thread for more information.
Add a run routine like before that will do the work.
But, look closely at the run routine. It takes a sensorRef which is a reference to data returned by a dataButler which is a way of abstracting the access to the data so you don't refer directly to file names etc.
Look here to learn more about the dataButler.
Also, look at how the test is run with the "parseAndRun" routine. parseAndRun comes from the base class CmdLineTask we inherited from.
Notice there is no "()" after ProcessTestTask. That is because we are not instantiating it ourselves. parseAndRun is a class method (similar to a static member function in C++): it is called on the class itself rather than on an instance, so we don't need to make one.
Next, notice the argument list. If it were empty, parseAndRun would use the standard command line arguments, so if I had a file processTest.py with ProcessTestTask.parseAndRun() inside of it
I could just call it like this from the commandline:
python processTest.py /Users/walter/LSST/iPython/testDir/imSim/PT1.2 --id visit=99999999
What happens when I run this is that it takes the information on the command line, produces a dataButler and then gives us a reference to it. That is what the sensorRef is all about. You should follow the instructions (there is a link on the workbook web site) that explain what to do to get your files into the correct directory structure.
Now, let's take the program we had before and wrap it in a commandline task.
import lsst.afw.math as math
import lsst.afw.table as afwTable
import lsst.afw.image as afwImg
import lsst.meas.algorithms as measAlg
import lsst.pex.config as pexConfig
import lsst.pipe.base as pipeBase
class ProcessTestConfig(pexConfig.Config):
    """Config for ProcessTest.

    Declares no fields of its own; ProcessTest refers to it through its
    ConfigClass attribute.
    """
class ProcessTest(pipeBase.CmdLineTask):
"""This class reads sensor data from PhoSim."""
ConfigClass = ProcessTestConfig
_DefaultName = "processTest"
dataPrefix = ""
def _getConfigName(self):
return None
def _getMetadataName(self):
return None
def readFile(self, sensorRef):
"""Read the file and make the exposure etc"""
print "Reading %s"%(sensorRef.dataId)
print sensorRef
self.exposure = sensorRef.get('eimage')
self.maskedImage = self.exposure.getMaskedImage()
self.image = self.maskedImage.getImage()
self.mask = self.maskedImage.getMask()
self.variance = self.maskedImage.getVariance()
def doStats(self):
"""Do pixel based processing"""
statFlags = math.NPOINT | math.MEAN | math.STDEV | math.MAX | math.MIN | math.ERRORS
control = math.StatisticsControl()
imageStatistics = math.makeStatistics(self.maskedImage, statFlags, control)
numBins = imageStatistics.getResult(math.NPOINT)[0]
mean = imageStatistics.getResult(math.MEAN)[0]
print "The image has dimensions %i x %i pixels" %(self.maskedImage.getWidth(),
self.maskedImage.getHeight())
print "Number of analyzed bins in image is %i" %numBins
print "Max = %9d" %imageStatistics.getResult(math.MAX)[0]
print "Min = %9d" %imageStatistics.getResult(math.MIN)[0]
print "Mean = %9.8f +- %3.1f" %imageStatistics.getResult(math.MEAN)
print "StdDev = %9.2f" %imageStatistics.getResult(math.STDEV)[0]
def detectAndMeasureSources(self):
"""Find and measure the sources"""
# Configure the detection and measurement algorithms
schema = afwTable.SourceTable.makeMinimalSchema()
detectSourcesConfig = measAlg.SourceDetectionConfig(thresholdType='value')
measureSourcesConfig = measAlg.SourceMeasurementConfig()
# Setup the detection and measurement tasks
detect = measAlg.SourceDetectionTask (config=detectSourcesConfig, schema=schema)
measure = measAlg.SourceMeasurementTask(config=measureSourcesConfig, schema=schema)
# Detect the sources,then put them into a catalog (the table is where the catalog atually stores stuff)
table = afwTable.SourceTable.make(schema)
catalog = detect.makeSourceCatalog(table, self.exposure, sigma=5)
# Get the sources out of the catalog and apply Measurement routines
sources = catalog.sources
measure.run(self.exposure, sources)
# Now let's look at the output from some of the measurment algorithms.
fields = ['centroid.sdss', 'shape.sdss','flux.gaussian']
keys = [schema.find(f).key for f in fields]
print "\nSources Found:"
for source in sources:
print "\tSource ", source.get('id')
for f,k in zip(fields, keys):
print "\t",f, source.get(k)
def run(self, sensorRef):
self.readFile(sensorRef)
self.doStats()
self.detectAndMeasureSources()
Now let's run it again. It should do exactly the same thing as before. I made two changes in how I accessed the data: I passed the sensorRef to my read routine (and printed it out), and then used it to open the file, which exists in the directory structure. This may not seem like a big difference, but in principle I could now open the data from somewhere else on the net, or from another type of camera with a different naming structure etc., without changing my code.
# Directory holding the data; passed as the first command-line style argument.
dDir = '/Users/walter/LSST/iPython/testDir/imSim/PT1.2'
# --id selects which visit to process.
output = ProcessTest.parseAndRun(args=[dDir,'--id','visit=99999999'])
Reading {'filter': 'r', 'sensor': '1,1', 'visit': 99999999, 'raft': '2,2'} <lsst.daf.persistence.butlerSubset.ButlerDataRef object at 0x10c80b910> The image has dimensions 4000 x 4072 pixels Number of analyzed bins in image is 16288000 Max = 1103 Min = 0 Mean = 0.00156667 +- 0.0 StdDev = 0.87 Sources Found: Source 1 centroid.sdss (1999.5, 2032.5) shape.sdss (ixx=2.88751926897, iyy=3.05814257946, ixy=-0.133327694904) flux.gaussian 21410.8483803
OK, finally, we are going to look at how routines like processCcd.py work. They don't inherit from CmdLineTask. They inherit from ProcessImage. ProcessImage inherits from CmdLineTask. This means we get all of the stuff from before + new routines in that class. The thing that is added in ProcessImage is all of the basic routines for sensor cleanup, detection and measurement. To use it requires setting some config parameters, which I won't do here, but let me show you how it works.
import lsst.afw.math as math
import lsst.afw.table as afwTable
import lsst.afw.image as afwImg
import lsst.meas.algorithms as measAlg
import lsst.pex.config as pexConfig
import lsst.pipe.base as pipeBase
from lsst.pipe.tasks.processImage import ProcessImageTask
class ProcessTestConfig(ProcessImageTask.ConfigClass):
    """Config for ProcessTest.

    Subclasses the config class of ProcessImageTask and adds no fields of
    its own.
    """
class ProcessTest(ProcessImageTask):
"""This class reads sensor data from either PhoSim or a teststand setup and
processes it."""
ConfigClass = ProcessTestConfig
_DefaultName = "processTest"
dataPrefix = ""
def _getConfigName(self):
return None
def _getMetadataName(self):
return None
#necessary for ProcessImage
def makeIdFactory(self, sensorRef):
return None
def readFile(self, sensorRef):
"""Read the file and return the exposure"""
print "Reading %s"%(sensorRef.dataId)
print sensorRef
return sensorRef.get('eimage')
def doStats(self, maskedImage):
"""Do pixel based processing"""
statFlags = math.NPOINT | math.MEAN | math.STDEV | math.MAX | math.MIN | math.ERRORS
control = math.StatisticsControl()
imageStatistics = math.makeStatistics(maskedImage, statFlags, control)
numBins = imageStatistics.getResult(math.NPOINT)[0]
mean = imageStatistics.getResult(math.MEAN)[0]
print "The image has dimensions %i x %i pixels" %(maskedImage.getWidth(),
maskedImage.getHeight())
print "Number of analyzed bins in image is %i" %numBins
print "Max = %9d" %imageStatistics.getResult(math.MAX)[0]
print "Min = %9d" %imageStatistics.getResult(math.MIN)[0]
print "Mean = %9.8f +- %3.1f" %imageStatistics.getResult(math.MEAN)
print "StdDev = %9.2f" %imageStatistics.getResult(math.STDEV)[0]
def detectAndMeasureSources(self):
"""Find and measure the sources"""
# Let processImage do the work!
result = self.process(sensorRef, exposure)
def run(self, sensorRef):
exposure = self.readFile(sensorRef)
self.doStats(exposure.getMaskedImage())
self.detectAndMeasureSources(sensorRef, exposure)
I'm not going to run this, since all of the command-line options and Config settings are awkward to do in the notebook, but you can see how it works: inherit from ProcessImage (which inherits from CmdLineTask), do all the extra stuff you want to do, then let ProcessImage.process do the detection and measurement etc.
This should give you enough of an understanding of the structure of a pipe_task to look at the internals of one, or write one yourself.
So now, when you see something like:
processCcdSdss.py sdss /lsst7/stripe82/dr7/runs --id run=1033 camcol=2 field=111 filter=g --output /nfs/lsst7/stripe82/dr7-coadds/v1/run2
you have an idea of what is going on.
One other thing for understanding the examples actually distributed with DM: A binary like processCcdSdss.py is in the package "bin" directory. It loads the class which is located in the package's "python" directory unlike my monolithic examples above.