Expression Probe Set Structure

Expression Probe Set Structure

Understanding Expression Probe Set Structure


Probe Set

The chip design strategy for expression probe sets is to use a set of PM/MM probe pairs to interrogate a target transcript.

Within a probe set are probe groups (also known as blocks). The CCDFProbeGroupInformation class in the File SDK is used to represent a group. Within an expression probe set there is one and only one group.

Probe Pairs

The order of the probe pairs in the CDF probe set is not guaranteed to be genomic order. The CDF file does not contain any information about the genomic position of the probe pairs.

Use the TBase and PBase entries of a probe to determine whether it is the PM or the MM probe. The probe is the PM probe when the complement of the PBase is equal to the TBase. The comparison should be case-insensitive.

Description

The sample code is written in C++ and uses the GCOS 1.x File SDK. The code implements a command line program which outputs the PM and MM intensities for all probe pairs in a probe set.

Source Code

#include "CELFileData.h"
#include "CDFFileData.h"
#include <string>
#include <vector>
#include <iostream>

using namespace affxcel;
using namespace affxcdf;
using namespace std;


/*! Determines if the probe is a PM probe.
 * @param cell The probe to test.
 * @return True if PM, false otherwise.
 */

bool IsPerfectMatch(const CCDFProbeInformation &cell)
{
   char pbase = tolower(cell.GetPBase());
   char tbase = tolower(cell.GetTBase());
   return (
      (pbase == 'a' && tbase == 't') || (pbase == 't' && tbase == 'a') ||
      (pbase == 'g' && tbase == 'c') || (pbase == 'c' && tbase == 'g')
   );
}

/*! This class contains functions to extract intensity data of a probe set. */
class ExtractProbeData
{
public:
   /*! Constructor */
   ExtractProbeData() { }

   /*! Destructor. */
   ~ExtractProbeData() { }

   /*! A structure to hold intensity data for a single probe pair. */
   typedef struct _ProbeData
   {
      float pm; /*! The PM probe intensity. */
      float mm; /*! The PM probe intensity. */
   } ProbeData;

private:
   /*! The CDF file object. */
   CCDFFileData cdfFile;

   /*! The probe set object for the desired probe set. */
   CCDFProbeSetInformation probeSet;

   /*! The CEL file object. */
   CCELFileData celFile;

   /*! Reads the CDF file.
 
   * @param cdf The name of the CDF file.
 
   * @return True if successful.
 
   */
   bool ReadCDF(const string &cdf)
   {
      cdfFile.Close();
      cdfFile.SetFileName(cdf.c_str());
      if (cdfFile.Exists() == false)
         return false;
      cdfFile.Read();
      return true;
   }

   /*! Reads the CEL file.
 
   * @param cdf The name of the CEL file.
 
   * @return True if successful.
 
   */
   bool ReadCEL(const string &cel)
   {
      celFile.Close();
      celFile.SetFileName(cel.c_str());
      if (celFile.Exists() == false)
         return false;
      celFile.Read();
      return true;
   }

   /*! Finds the probe set object for the given probe set.
 
   * @param name The name of the probe set.
 
   * @return True if successful.
 
   */
   bool ExtractSet(const string &name)
   {
      CCDFFileHeader &header = cdfFile.GetHeader();
      int nsets = header.GetNumProbeSets();
      for (int i=0; i<nsets; i++)
      {
         if (name == cdfFile.GetProbeSetName(i))
         {
            cdfFile.GetProbeSetInformation(i, probeSet);
            return true;
         }
      }
      return false;
   }

   /*! Extracts the intensities for the probe pairs in a probe set.
 
   * @param intensities The CEL file intensities associated with the probe set.
 
   * @return True if successful.
 
   */
   bool ExtractIntensities(vector<ProbeData> &intensities)
   {
      // Get the one and only group.
      CCDFProbeGroupInformation group;
      probeSet.GetGroupInformation(0, group);

      // Get the number of probe pairs and allocate memory for the results.
      int nProbePairs = group.GetNumLists();
      intensities.resize(nProbePairs);

      // Get each pair of probes and store the results in the intensities object.
      const int TWO_PROBES_PER_PAIR = 2;
      CCDFProbeInformation cell[TWO_PROBES_PER_PAIR];
      float inten[TWO_PROBES_PER_PAIR];
      for (int i=0, icel=0; i<nProbePairs; i++, icel+=TWO_PROBES_PER_PAIR)
      {
         group.GetCell(icel, cell[0]);
         group.GetCell(icel+1, cell[1]);
         inten[0] = celFile.GetIntensity(cell[0].GetX(), cell[0].GetY());
         inten[1] = celFile.GetIntensity(cell[1].GetX(), cell[1].GetY());

         if (IsPerfectMatch(cell[0]) == true)
         {
            intensities[i].pm = inten[0];
            intensities[i].mm = inten[1];
         }
         else
         {
            intensities[i].mm = inten[0];
            intensities[i].pm = inten[1];
         }
      }
      return true;
   }

public:

   /*! Extracts the intensity for the given probe set.
    * @param cel The name of the CEL file.
    * @param cdf The name of the CDF file.
    * @param name The name of the probe set.
    * @param intensities The CEL file intensities associated with the probe set.
    * @return True if successful.
    */

   bool ExtractProbePairIntensities(const string &cel, const string &cdf, const string &name, vector<ProbeData> &intensities)
   {
      intensities.clear();

      if (ReadCDF(cdf) == false)
         return false;

      if (ExtractSet(name) == false)
         return false;

      if (ReadCEL(cel) == false)
        return false;

      if (ExtractIntensities(intensities) == false)
         return false;

      return  true;
   }
};

/*! Gets a file name from the command line arguments.
 *
 * Every argument after position i that does not begin with '-' is appended
 * to the result, with a single space between consecutive arguments, so a
 * file name containing spaces can be given without quoting.
 *
 * @param argc The number of arguments in the argv array.
 * @param argv The command line arguments.
 * @param i On entry, the index of the option that precedes the file name.
 *          On exit, the index of the last argument that was consumed
 *          (unchanged when no argument was consumed).
 * @return The file name, or an empty string when no argument was consumed.
 */
string get_file(int argc, char *argv[], int &i)
{
   const int first = i + 1;
   string joined;

   int next = first;
   for (; next < argc && argv[next][0] != '-'; ++next)
   {
      // Separate every token after the first one with a single space.
      if (next != first)
         joined += " ";
      joined += argv[next];
   }

   // Leave the caller's index on the last token that belonged to the name.
   i = next - 1;
   return joined;
}

/*! This example program will extract intensity data from a CEL file for an associated probe set.
 * @param argc The number of arguments in the argv array.
 * @param argv The command line arguments.
 * @return The status.
 */

int main(int argc, char **argv)
{
   // Show the command line arguments.
   if (argc == 1)
   {
      cout << "Synopsis: <-cel cel_file_name> <-cdf cdf_file_name> <-ps probe_set_name>" << endl;
      return 0 ;
   }

   // Get the command line arguments.
   int i=1;
   string cel;
   string cdf;
   string name;
   while(i<argc)
   {
      if (strcmp(argv[i], "-cel") == 0)
         cel = get_file(argc, argv, i);

      else if (strcmp(argv[i], "-cdf") == 0)
         cdf = get_file(argc, argv, i);

      else if (strcmp(argv[i], "-ps") == 0)
         name = argv[i+1];

      ++i;
   }

   // Extract the data and print the results.
   ExtractProbeData ext;
   vector<ExtractProbeData::ProbeData> inten;
   if (ext.ExtractProbePairIntensities(cel, cdf, name, inten) == true)
   {
      for (int i=0; i<(int)inten.size(); i++)
      {
         cout << "[" << i << "]" << endl;
         cout << "pm=" << inten[i].pm << endl;
         cout << "mm=" << inten[i].mm << endl;
      }
   }

   return 0;
}

Back to Top >