Scientific Publications
Understanding Expression Probe Set Structure
Probe Set
The chip design strategy for expression probe sets is to use a set of PM/MM probe pairs to interrogate a target transcript.
Within a probe set are probe groups (also known as blocks). The CCDFProbeGroupInformation class in the File SDK is used to represent a group. Within an expression probe set there is one and only one group.
Probe Pairs
The order of the probe pairs in the CDF probe set is not guaranteed to be the genomic order. The CDF file does not contain any information about the genomic position of the probe pairs.
Use the TBase and PBase entries of each probe to determine which probe of a pair is the PM probe and which is the MM probe. A probe is the PM probe when the complement of its PBase is equal to its TBase. The comparison should be case-insensitive.
Description
The sample code is written in C++ and uses the GCOS 1.x File SDK. The code implements a command line program that outputs the PM and MM intensities for all probe pairs in a probe set.
Source Code
#include "CELFileData.h"
#include "CDFFileData.h"
#include <string>
#include <vector>
#include <iostream>
using namespace affxcel;
using namespace affxcdf;
using namespace std;
/*! Tests whether a probe is the perfect match (PM) probe of its pair.
 * Both the probe base and the target base are lower-cased first, so the
 * comparison is case insensitive. The probe is reported as PM when the two
 * bases form one of the pairs a/t, t/a, g/c or c/g.
 * @param cell The probe to test.
 * @return True if PM, false otherwise.
 */
bool IsPerfectMatch(const CCDFProbeInformation &cell)
{
    const char probeBase = tolower(cell.GetPBase());
    const char targetBase = tolower(cell.GetTBase());

    // Select the partner base expected for the probe base; any other
    // probe base can never be a perfect match.
    switch (probeBase)
    {
    case 'a':
        return targetBase == 't';
    case 't':
        return targetBase == 'a';
    case 'g':
        return targetBase == 'c';
    case 'c':
        return targetBase == 'g';
    default:
        return false;
    }
}
/*! This class contains functions to extract intensity data of a probe set.
 * It reads a CDF file to locate the probe set, reads a CEL file, and pairs
 * up the PM and MM intensities of every probe pair in the probe set.
 */
class ExtractProbeData
{
public:
    /*! Constructor */
    ExtractProbeData() { }

    /*! Destructor. */
    ~ExtractProbeData() { }

    /*! A structure to hold intensity data for a single probe pair. */
    typedef struct _ProbeData
    {
        float pm; /*! The PM probe intensity. */
        float mm; /*! The MM probe intensity. */
    } ProbeData;

private:
    /*! The CDF file object. */
    CCDFFileData cdfFile;

    /*! The probe set object for the desired probe set. */
    CCDFProbeSetInformation probeSet;

    /*! The CEL file object. */
    CCELFileData celFile;

    /*! Reads the CDF file.
     * @param cdf The name of the CDF file.
     * @return True if the file exists and was read successfully.
     */
    bool ReadCDF(const string &cdf)
    {
        cdfFile.Close();
        cdfFile.SetFileName(cdf.c_str());
        if (cdfFile.Exists() == false)
            return false;

        // A file that exists but cannot be read must be reported as a
        // failure, so the result of Read() is not discarded.
        if (cdfFile.Read() == false)
            return false;
        return true;
    }

    /*! Reads the CEL file.
     * @param cel The name of the CEL file.
     * @return True if the file exists and was read successfully.
     */
    bool ReadCEL(const string &cel)
    {
        celFile.Close();
        celFile.SetFileName(cel.c_str());
        if (celFile.Exists() == false)
            return false;

        // Same reasoning as in ReadCDF: do not ignore a failed read.
        if (celFile.Read() == false)
            return false;
        return true;
    }

    /*! Finds the probe set object for the given probe set.
     * The probe set names in the CDF file are scanned in index order and
     * the first exact (case sensitive) match is stored in probeSet.
     * @param name The name of the probe set.
     * @return True if a probe set with that name was found.
     */
    bool ExtractSet(const string &name)
    {
        CCDFFileHeader &header = cdfFile.GetHeader();
        int nsets = header.GetNumProbeSets();
        for (int i = 0; i < nsets; i++)
        {
            if (name == cdfFile.GetProbeSetName(i))
            {
                cdfFile.GetProbeSetInformation(i, probeSet);
                return true;
            }
        }
        return false;
    }

    /*! Extracts the intensities for the probe pairs in a probe set.
     * The cells of the group are taken two at a time; within each pair the
     * cell identified by IsPerfectMatch supplies the PM intensity and the
     * other cell supplies the MM intensity.
     * @param intensities The CEL file intensities associated with the probe set.
     * @return True if successful, false if the probe set has no group or the
     *         group does not hold two cells for every probe pair.
     */
    bool ExtractIntensities(vector<ProbeData> &intensities)
    {
        const int TWO_PROBES_PER_PAIR = 2;

        // Get the one and only group; bail out rather than read a group
        // that does not exist.
        if (probeSet.GetNumGroups() < 1)
            return false;
        CCDFProbeGroupInformation group;
        probeSet.GetGroupInformation(0, group);

        // Get the number of probe pairs and make sure the group really
        // holds two cells per pair before indexing into it.
        int nProbePairs = group.GetNumLists();
        if (nProbePairs < 0 ||
            group.GetNumCells() < nProbePairs * TWO_PROBES_PER_PAIR)
            return false;

        // Allocate memory for the results.
        intensities.resize(nProbePairs);

        // Get each pair of probes and store the results in the intensities object.
        CCDFProbeInformation cell[TWO_PROBES_PER_PAIR];
        float inten[TWO_PROBES_PER_PAIR];
        for (int i = 0, icel = 0; i < nProbePairs; i++, icel += TWO_PROBES_PER_PAIR)
        {
            group.GetCell(icel, cell[0]);
            group.GetCell(icel + 1, cell[1]);
            inten[0] = celFile.GetIntensity(cell[0].GetX(), cell[0].GetY());
            inten[1] = celFile.GetIntensity(cell[1].GetX(), cell[1].GetY());

            // Only the first cell is tested; when it is not the PM probe
            // the second cell is taken to be the PM probe.
            if (IsPerfectMatch(cell[0]) == true)
            {
                intensities[i].pm = inten[0];
                intensities[i].mm = inten[1];
            }
            else
            {
                intensities[i].mm = inten[0];
                intensities[i].pm = inten[1];
            }
        }
        return true;
    }

public:
    /*! Extracts the intensity for the given probe set.
     * The output vector is cleared first, so it is empty whenever this
     * function returns false.
     * @param cel The name of the CEL file.
     * @param cdf The name of the CDF file.
     * @param name The name of the probe set.
     * @param intensities The CEL file intensities associated with the probe set.
     * @return True if successful.
     */
    bool ExtractProbePairIntensities(const string &cel, const string &cdf,
                                     const string &name, vector<ProbeData> &intensities)
    {
        intensities.clear();
        if (ReadCDF(cdf) == false)
            return false;
        if (ExtractSet(name) == false)
            return false;
        if (ReadCEL(cel) == false)
            return false;
        if (ExtractIntensities(intensities) == false)
            return false;
        return true;
    }
};
/*! Collects the file name that follows an option on the command line.
 * Every argument after position i is appended, separated by single spaces,
 * until the end of the argument list or an argument starting with '-' is
 * reached. This lets a name containing spaces be given as several arguments.
 * @param argc The number of arguments in the argv array.
 * @param argv The command line arguments.
 * @param i On entry, the index of the option whose value is wanted. On
 *          return, the index of the last argument consumed (unchanged when
 *          no argument was consumed).
 * @return The file, or an empty string when no argument was consumed.
 */
string get_file(int argc, char *argv[], int &i)
{
    const int first = i + 1;
    string joined = "";

    int next = first;
    for (; next < argc && argv[next][0] != '-'; ++next)
    {
        // Every piece after the first is preceded by a separating space.
        if (next > first)
            joined += " ";
        joined += argv[next];
    }

    // Leave the caller's index on the last argument that was used.
    i = next - 1;
    return joined;
}
/*! This example program will extract intensity data from a CEL file for an
 * associated probe set and print the PM and MM intensity of every probe pair.
 * @param argc The number of arguments in the argv array.
 * @param argv The command line arguments.
 * @return The status: 0 on success or when only the synopsis is shown,
 *         1 when the arguments are incomplete or the extraction fails.
 */
int main(int argc, char **argv)
{
    // Show the command line arguments.
    if (argc == 1)
    {
        cout << "Synopsis: <-cel cel_file_name> <-cdf cdf_file_name> <-ps probe_set_name>" << endl;
        return 0;
    }

    // Get the command line arguments.
    string cel;
    string cdf;
    string name;
    for (int i = 1; i < argc; ++i)
    {
        const string option = argv[i];
        if (option == "-cel")
            cel = get_file(argc, argv, i);
        else if (option == "-cdf")
            cdf = get_file(argc, argv, i);
        else if (option == "-ps")
        {
            // "-ps" as the final argument has no value; argv[argc] is a
            // null pointer and must not be turned into a string.
            if (i + 1 >= argc)
            {
                cerr << "Missing probe set name after -ps" << endl;
                return 1;
            }
            // The value is not consumed here; the next iteration visits it
            // and skips it unless it happens to be an option itself.
            name = argv[i + 1];
        }
    }

    // Extract the data and print the results.
    ExtractProbeData ext;
    vector<ExtractProbeData::ProbeData> inten;
    if (ext.ExtractProbePairIntensities(cel, cdf, name, inten) == false)
    {
        // Report the failure instead of exiting silently with success.
        cerr << "Unable to extract the intensities for the probe set" << endl;
        return 1;
    }

    const int nPairs = static_cast<int>(inten.size());
    for (int pair = 0; pair < nPairs; pair++)
    {
        cout << "[" << pair << "]" << '\n';
        cout << "pm=" << inten[pair].pm << '\n';
        cout << "mm=" << inten[pair].mm << '\n';
    }
    cout.flush();
    return 0;
}


