Scientific Publications
Understanding Genotyping Probe Set Structure
Probe Set
The chip design strategy for genotyping probe sets is to use a set of PM/MM probe pairs to interrogate the bases surrounding the SNP on the forward and/or reverse target for both the A and B alleles.
The CDF file defines a grouping of PM/MM probe pairs by direction and allele. Genotyping probe sets typically contain 2 or 4 of these groups.
The 4 group probe set is defined by:
- Group 1 - probes interrogating the X* direction of the A allele target.
- Group 2 - probes interrogating the X* direction of the B allele target.
- Group 3 - probes interrogating the Y* direction of the A allele target.
- Group 4 - probes interrogating the Y* direction of the B allele target.
*The direction is determined by the direction attribute of the group.
The 2 group probe set is defined by:
- Group 1 - probes interrogating the X direction of the A allele target.
- Group 2 - probes interrogating the X direction of the B allele target.
Where X is either the forward or reverse direction, determined by the direction attribute of the group.
Note: The CCDFProbeGroupInformation class in the File SDK is used to represent a group. In the CDF file, the group is referred to as a "block".
There are some probe sets in the CDF file which may contain a single group. These are control probe sets which are not analyzed in the GDAS/GTYPE software. These probe sets can also be identified by the probe set name beginning with "AFFX".
Group
Within a group there are probes to interrogate the neighboring bases of the SNP's position. A probe pair to interrogate the SNP position is usually, but not always, within the probe group.
Use the TBase entry of the probe to determine which probe pair is designed to interrogate the SNP base. When the TBase values of the corresponding A allele and B allele probes differ, that probe pair is the one designed to interrogate the SNP base.
To determine the offset (the base position being interrogated by the probes relative to the SNP position) of the other probes, subtract the Expos value of the probe interrogating the SNP base from the probe's Expos value. Typically you will see probes that interrogate the bases which are plus and minus 4, 2 and 1 bases from the SNP position.
Description
The sample code is written in C++ and uses the GCOS 1.x File SDK. The code implements a command line program which outputs the PM and MM intensities associated with the probes interrogating one direction of the A and B alleles. The intensities for the probes interrogating the other direction are not provided, as this is meant solely as an example for demonstration purposes.
Source Code
#include "CELFileData.h"
#include "CDFFileData.h"
#include <cctype>
#include <cstring>
#include <iostream>
#include <string>
#include <vector>
using namespace affxcel;
using namespace affxcdf;
using namespace std;
/*! Tests whether a probe is a perfect match (PM) probe.
 *  A probe is reported as PM when its probe base (PBase) pairs with its
 *  target base (TBase) as a/t, t/a, g/c or c/g. Case is ignored.
 * @param cell The probe to test.
 * @return True if PM, false otherwise.
 */
bool IsPerfectMatch(const CCDFProbeInformation &cell)
{
	const char probeBase = tolower(cell.GetPBase());
	const char targetBase = tolower(cell.GetTBase());

	// Look up the partner base expected for the probe base; any other
	// probe base can never form a perfect match.
	switch (probeBase)
	{
	case 'a': return targetBase == 't';
	case 't': return targetBase == 'a';
	case 'g': return targetBase == 'c';
	case 'c': return targetBase == 'g';
	default:  return false;
	}
}
/*! This class contains functions
to extract intensity data associated with a SNP. */
class ExtractSNP
{
public:
/*! Constructor */
ExtractSNP() { }
/*! Destructor. */
~ExtractSNP() { }
/*! A structure to hold
intensity data for a single target base of the A and B
alleles (one direction only). */
typedef struct _SNPData
{
float pmA; /*! The PM
probe for the A allele. */
float mmA; /*! The PM
probe for the A allele. */
float pmB; /*! The MM
probe for the B allele. */
float mmB; /*! The MM
probe for the B allele. */
int pos; /*! The
position of the interrogation base relative to the snp
position, or the position from the CDF file. */
bool rel; /*! True
if the pos is a relative position, false if the pos is
just the position from the CDF file. */
} SNPData;
private:
/*! The CDF file object. */
CCDFFileData cdfFile;
/*! The probe set object for
the desired SNP. */
CCDFProbeSetInformation probeSet;
/*! The CEL file object. */
CCELFileData celFile;
/*! Reads the CDF file.
* @param cdf The name of the CDF file.
* @return True if successful.
*/
bool ReadCDF(const string &cdf)
{
cdfFile.Close();
cdfFile.SetFileName(cdf.c_str());
if (cdfFile.Exists() == false)
return false;
cdfFile.Read();
return true;
}
/*! Reads the CEL file.
* @param cdf The name of the CEL file.
* @return True if successful.
*/
bool ReadCEL(const string &cel)
{
celFile.Close();
celFile.SetFileName(cel.c_str());
if (celFile.Exists() == false)
return false;
celFile.Read();
return true;
}
/*! Finds the probe set object
for the given SNP.
* @param snp The name of the SNP.
* @return True if successful.
*/
bool ExtractSet(const string &snp)
{
CCDFFileHeader &header = cdfFile.GetHeader();
int nsets = header.GetNumProbeSets();
for (int i=0; i<nsets; i++)
{
if (snp == cdfFile.GetProbeSetName(i))
{
cdfFile.GetProbeSetInformation(i,
probeSet);
return true;
}
}
return false;
}
/*! Extracts the intensities
for a single direction probes.
* @param intensities The CEL file intensities
associated with the snp.
* @return True if successful.
*/
bool ExtractIntensities(vector<SNPData>
&intensities)
{
if (probeSet.GetNumGroups() < 2)
return false;
// Get the first A and B groups.
The first group is the A group.
// This example only extracts data
for one direction, repeat
// the code for the other
direction.
CCDFProbeGroupInformation groupA;
CCDFProbeGroupInformation groupB;
probeSet.GetGroupInformation(0, groupA);
probeSet.GetGroupInformation(1, groupB);
// Get the number of probe
pairs and allocate memory for the results.
// These are the same for A and B groups.
int nProbePairs = groupA.GetNumLists();
intensities.resize(nProbePairs);
// Get each pair of probes
for the A and B alleles and store the results
// in the intensities object.
// If the TBase values are the different from
the A and B then this is the
// probe which interrogates the SNP location.
The position values will be adjusted
// based on this position.
const int TWO_PROBES_PER_PAIR = 2;
CCDFProbeInformation cellA[TWO_PROBES_PER_PAIR];
CCDFProbeInformation cellB[TWO_PROBES_PER_PAIR];
float intenA[TWO_PROBES_PER_PAIR];
float intenB[TWO_PROBES_PER_PAIR];
int snpPos = -1;
for (int i=0, icel=0; i<nProbePairs; i++, icel+=TWO_PROBES_PER_PAIR)
{
groupA.GetCell(icel, cellA[0]);
groupA.GetCell(icel+1, cellA[1]);
intenA[0] = celFile.GetIntensity(cellA[0].GetX(),
cellA[0].GetY());
intenA[1] = celFile.GetIntensity(cellA[1].GetX(),
cellA[1].GetY());
groupB.GetCell(icel, cellB[0]);
groupB.GetCell(icel+1, cellB[1]);
intenB[0] = celFile.GetIntensity(cellB[0].GetX(),
cellB[0].GetY());
intenB[1] = celFile.GetIntensity(cellB[1].GetX(),
cellB[1].GetY());
if (IsPerfectMatch(cellA[0]) == true)
{
intensities[i].pmA = intenA[0];
intensities[i].mmA = intenA[1];
}
else
{
intensities[i].mmA = intenA[0];
intensities[i].pmA = intenA[1];
}
if (IsPerfectMatch(cellB[0]) == true)
{
intensities[i].pmB = intenB[0];
intensities[i].mmB = intenB[1];
}
else
{
intensities[i].mmB = intenB[0];
intensities[i].pmB = intenB[1];
}
intensities[i].pos = cellA[0].GetExpos();
intensities[i].rel = false;
if (cellA[0].GetTBase() != cellB[0].GetTBase())
snpPos = cellA[0].GetExpos();
}
// Go back and adjust the
positions relative to the SNP position.
for (int i=0; i<nProbePairs && snpPos != -1; i++)
{
intensities[i].pos -= snpPos;
intensities[i].rel = true;
}
return true;
}
public:
/*! Extracts the intensity for
the given SNP.
* @param cel The name of the CEL file.
* @param cdf The name of the CDF file.
* @param snp The name of the SNP.
* @param intensities The CEL file intensities
associated with the snp.
* @return True if successful.
*/
bool ExtractSNPIntensities(const string &cel, const
string &cdf, const string &snp, vector<SNPData>
&intensities)
{
intensities.clear();
if (ReadCDF(cdf) == false)
return false;
if (ExtractSet(snp) == false)
return false;
if (ReadCEL(cel) == false)
return false;
if (ExtractIntensities(intensities) == false)
return false;
return true;
}
};
/*! Gets a file name from the command line arguments.
 *  Every argument after position i that does not start with '-' is taken
 *  as part of the name; the parts are joined with single spaces, so a
 *  name containing spaces can be given unquoted.
 * @param argc The number of arguments in the argv array.
 * @param argv The command line arguments.
 * @param i On entry, the index of the option preceding the file name.
 *          On return, the index of the last argument used for the name
 *          (unchanged when no argument was used).
 * @return The file, or an empty string if no name follows the option.
 */
string get_file(int argc, char *argv[], int &i)
{
	const int first = i + 1;
	string joined;

	// Walk forward until the argument list ends or the next option starts.
	int next = first;
	for (; next < argc && argv[next][0] != '-'; ++next)
	{
		// Separate this part from the one before it.
		if (next != first)
			joined += " ";
		joined += argv[next];
	}

	i = next - 1;
	return joined;
}
/*! This example program will
extract intensity data from a CEL file for an associated
SNP.
* @param argc The number of arguments in the argv array.
* @param argv The command line arguments.
* @return The status.
*/
int main(int argc, char **argv)
{
// Show the command line
arguments.
if (argc == 1)
{
cout << "Synopsis: <-cel cel_file_name> <-cdf
cdf_file_name> <-snp snp_id>" << endl;
return 0 ;
}
// Get the command line
arguments.
int i=1;
string cel;
string cdf;
string snp;
while(i<argc)
{
if (strcmp(argv[i], "-cel") == 0)
cel = get_file(argc, argv, i);
else if (strcmp(argv[i], "-cdf") == 0)
cdf = get_file(argc, argv, i);
else if (strcmp(argv[i], "-snp") == 0)
snp = argv[i+1];
++i;
}
// Extract the data and print
the results.
ExtractSNP ext;
vector<NP::SNPData> inten;
if (ext.ExtractSNPIntensities(cel, cdf, snp, inten)
== true)
{
for (int i=0; i<(int)inten.size(); i++)
{
cout << "[" << i << "]" << endl;
cout << "pmA=" << inten[i].pmA << endl;
cout << "mmA=" << inten[i].mmA << endl;
cout << "pmA=" << inten[i].pmB << endl;
cout << "mmA=" << inten[i].mmB << endl;
cout << "pos=" << inten[i].pos << endl;
cout << "rel=" << (inten[i].rel == true ?
"true" : "false") << endl;
}
}
return 0;
}

