13.2.17. Macro “dataserverPCAExample.C

13.2.17.1. Objective

The goal of this macro is to show how to handle a PCA analysis. It is not much discussed here, as a large description of both methods and concepts is done in Combining these aspects: performing PCA.

13.2.17.2. Macro Uranie

    // Read the database
    TDataServer * tdsPCA = new TDataServer("tdsPCA", "my TDS");
    tdsPCA->fileDataRead("Notes.dat");

    // Create the PCA object precising the variables of interest
    TPCA * tpca = new TPCA(tdsPCA, "Maths:Physics:French:Latin:Music");
    tpca->compute();

    bool graphical=true; // do graphs
    bool dumponscreen=true; //or dumping results  
    bool showcoordinate=false; // show the coordinate of the points while dumping results

    if(graphical) {

        // Draw all point in PCA planes
        TCanvas *cPCA = new TCanvas("cpca", "PCA",800,800);
        TPad *apad1 = new TPad("apad1","apad1",0, 0.03, 1, 1); apad1->Draw(); apad1->cd();
        apad1->Divide(2,2);
        apad1->cd(1); tpca->drawPCA(1,2,"Pupil");
        apad1->cd(3); tpca->drawPCA(1,3,"Pupil");
        apad1->cd(4); tpca->drawPCA(2,3,"Pupil");

        // Draw all variable weight in PC definition  
        TCanvas *cLoading = new TCanvas("cLoading", "Loading Plot",800,800);
        TPad *apad2 = new TPad("apad2","apad2",0, 0.03, 1, 1); apad2->Draw(); apad2->cd();
        apad2->Divide(2,2);
        apad2->cd(1); tpca->drawLoading(1,2);
        apad2->cd(3); tpca->drawLoading(1,3);
        apad2->cd(4); tpca->drawLoading(2,3);

        // Draw the eigen values in different normalisation
        TCanvas *c = new TCanvas("cEigenValues", "Eigen Values Plot",1100,500);
        TPad *apad3 = new TPad("apad3","apad3",0, 0.03, 1, 1);
        apad3->Draw();
        apad3->cd();
        apad3->Divide(3,1);
        TNtupleD *ntd = tpca->getResultNtupleD();
        apad3->cd(1); ntd->Draw("eigen:i","","lp");
        apad3->cd(2); ntd->Draw("eigen_pct:i","","lp"); gPad->SetGrid();
        apad3->cd(3); ntd->Draw("sum_eigen_pct:i","","lp"); gPad->SetGrid();
    }

    if(dumponscreen)
    {
        
        int nPCused=5; // 3 to see only the meaningful ones
        TString PCname="", Cosname="", Contrname="", Variable="Pupil";
        for (unsigned int iatt=1; iatt<=nPCused; iatt++)
        {
            //list of PC: PC_1:PC_2:PC_3:PC_4:PC_5
            PCname+=Form("PC_%d",iatt)+((iatt!=nPCused)?TString(":"):TString(""));
            //list of quality coeff: cosca_1:cosca_2:cosca_3:cosca_4:cosca_5
            Cosname+=Form("cosca_%d",iatt)+((iatt!=nPCused)?TString(":"):TString(""));
            //list of contribution: contr_1:contr_2:contr_3:contr_4:contr_5
            Contrname+=Form("contr_%d",iatt)+((iatt!=nPCused)?TString(":"):TString(""));
        }

        cout<<endl<<"========= EigenValues ====================="<<endl;
        tpca->getResultNtupleD()->Scan("*");

        if(showcoordinate)
        {
            cout<<endl<<"========= EigenVectors ====================="<<endl;
            tpca->_matEigenVectors.Print();
        
            cout<<endl<<"========= New Coordinates ====================="<<endl;
            tdsPCA->scan( (Variable+":"+PCname).Data() );
        }
        
        TDSNtupleD *varRes  = tpca->getVariableResultNtupleD();
        cout<<endl<<"=============== Looking at variables: Quality of representation ====================="<<endl;
        varRes->Scan(("Variable:"+Cosname).Data());
        
        cout<<endl<<"==================== Looking at variables: Contribution to axis ====================="<<endl;
        varRes->Scan(("Variable:"+Contrname).Data());
        
        cout<<endl<<"================== Looking at events:  Quality of representation ===================="<<endl;
        tdsPCA->scan((Variable+":"+Cosname).Data());
        
        cout<<endl<<"===================== Looking at events: Contribution to axis ======================="<<endl;
        tdsPCA->scan((Variable+":"+Contrname).Data());
    }

This first part of this macro is described in Combining these aspects: performing PCA. We will focus here on the second part, where all the numerical results are dumped on screen. These results are stored in the dataserver for the points and in a dedicated ntuple for the variable that can be retrieved by calling the method getVariableResultNtupleD. In both cases, the results can be split into two kinds:

  • the quality of the representation: it is called “cosca_X” as it is a squared cosinus of the projection of the source under study (point or subject) on the X-th PC.

  • the contribution to axis: it is called “contr_X” as it is the contribution of the source under study (point or subject) to the definition of the X-th PC.

We start by defining the list of variables that one might want to display

        int nPCused=5; // 3 to see only the meaningful ones
        TString PCname="", Cosname="", Contrname="", Variable="Pupil";
        for (unsigned int iatt=1; iatt<=nPCused; iatt++)
        {
            //list of PC: PC_1:PC_2:PC_3:PC_4:PC_5
            PCname+=Form("PC_%d",iatt)+((iatt!=nPCused)?TString(":"):TString(""));
            //list of quality coeff: cosca_1:cosca_2:cosca_3:cosca_4:cosca_5
            Cosname+=Form("cosca_%d",iatt)+((iatt!=nPCused)?TString(":"):TString(""));
            //list of contribution: contr_1:contr_2:contr_3:contr_4:contr_5
            Contrname+=Form("contr_%d",iatt)+((iatt!=nPCused)?TString(":"):TString(""));
        }

From there, once the variable ntuple is retrieved, one can dump both the quality and contribution coefficients for the variable, here the subjects (it leads to the second and third block in the output shown in Console).

        TDSNtupleD *varRes  = tpca->getVariableResultNtupleD();
        cout<<endl<<"=============== Looking at variables: Quality of representation ====================="<<endl;
        varRes->Scan(("Variable:"+Cosname).Data());
        
        cout<<endl<<"==================== Looking at variables: Contribution to axis ====================="<<endl;
        varRes->Scan(("Variable:"+Contrname).Data());

Finally, one can do the same for the data points, with the same Scan method (which lead to the fourth and fifth block in the output shown in Console).

        cout<<endl<<"================== Looking at events:  Quality of representation ===================="<<endl;
        tdsPCA->scan((Variable+":"+Cosname).Data());
        
        cout<<endl<<"===================== Looking at events: Contribution to axis ======================="<<endl;
        tdsPCA->scan((Variable+":"+Contrname).Data());

13.2.17.3. Console

========= EigenValues =====================
************************************************************
*    Row   *       i.i * eigen.eig * eigen_pct * sum_eigen *
************************************************************
*        0 *         1 * 2.8618175 * 57.236350 * 57.236350 *
*        1 *         2 * 1.1506811 * 23.013622 * 80.249973 *
*        2 *         3 * 0.9831407 * 19.662814 * 99.912787 *
*        3 *         4 * 0.0039371 * 0.0787424 * 99.991530 *
*        4 *         5 * 0.0004234 * 0.0084696 *       100 *
************************************************************

=============== Looking at variables: Quality of representation =====================
************************************************************************************
*    Row   *  Variable *   cosca_1 *   cosca_2 *   cosca_3 *   cosca_4 *   cosca_5 *
************************************************************************************
*        0 *     Maths *  0.649485 *   0.32645 * 0.0235449 * 0.0003614 * 0.0001582 *
*        1 *   Physics *  0.804627 *  0.185581 * 0.0086212 * 0.0010515 * 0.0001193 *
*        2 *    French *  0.574697 *  0.373378 * 0.0509446 * 0.0008976 * 8.252e-05 *
*        3 *     Latin *  0.828562 *  0.157999 * 0.0117556 * 0.0016205 * 6.335e-05 *
*        4 *     Music * 0.0044470 *  0.107272 *  0.888274 * 5.976e-06 * 8.299e-08 *
************************************************************************************

==================== Looking at variables: Contribution to axis =====================
************************************************************************************
*    Row   *  Variable *   contr_1 *   contr_2 *   contr_3 *   contr_4 *   contr_5 *
************************************************************************************
*        0 *     Maths *  0.226948 *  0.283702 * 0.0239486 * 0.0917987 *  0.373602 *
*        1 *   Physics *  0.281159 *   0.16128 * 0.0087691 *  0.267082 *   0.28171 *
*        2 *    French *  0.200815 *  0.324485 * 0.0518182 *  0.228004 *  0.194878 *
*        3 *     Latin *  0.289523 *  0.137309 * 0.0119572 *  0.411598 *  0.149613 *
*        4 *     Music * 0.0015539 * 0.0932252 *  0.903507 * 0.0015180 * 0.0001959 *
************************************************************************************

================== Looking at events:  Quality of representation ====================
************************************************************************************
*    Row   *     Pupil *   cosca_1 *   cosca_2 *   cosca_3 *   cosca_4 *   cosca_5 *
************************************************************************************
*        0 *      Jean * 0.8854534 * 0.0522119 * 0.0619429 * 0.0002655 * 0.0001260 *
*        1 *     Aline * 0.7920409 * 0.0542262 * 0.1530381 * 0.0006354 * 5.916e-05 *
*        2 *     Annie * 0.4784294 * 0.4813342 * 0.0384099 * 0.0018007 * 2.560e-05 *
*        3 *   Monique * 0.8785990 * 0.0024790 * 0.1180158 * 0.0009035 * 2.557e-06 *
*        4 *    Didier * 0.8515216 * 0.1382946 * 0.0079754 * 0.0021718 * 3.640e-05 *
*        5 *     Andre * 0.2465355 * 0.3961581 * 0.3567663 * 9.471e-05 * 0.0004451 *
*        6 *    Pierre * 0.0263090 * 0.7670958 * 0.2060832 * 0.0004625 * 4.925e-05 *
*        7 *  Brigitte * 0.1876629 * 0.5897686 * 0.2211409 * 0.0013390 * 8.836e-05 *
*        8 *   Evelyne * 0.0583185 * 0.3457931 * 0.5953786 * 0.0004600 * 4.957e-05 *
************************************************************************************

===================== Looking at events: Contribution to axis =======================
************************************************************************************
*    Row   *     Pupil *   contr_1 *   contr_2 *   contr_3 *   contr_4 *   contr_5 *
************************************************************************************
*        0 *      Jean * 0.3012931 * 0.0441856 * 0.0613538 * 0.0656820 * 0.2897335 *
*        1 *     Aline * 0.0618830 * 0.0105370 * 0.0348056 * 0.0360894 * 0.0312404 *
*        2 *     Annie * 0.0401366 * 0.1004284 * 0.0093797 * 0.1098075 * 0.0145176 *
*        3 *   Monique * 0.3784615 * 0.0026558 * 0.1479781 * 0.2828995 * 0.0074454 *
*        4 *    Didier * 0.1484067 * 0.0599446 * 0.0040461 * 0.2751439 * 0.0428726 *
*        5 *     Andre * 0.0348742 * 0.1393737 * 0.1469047 * 0.0097384 * 0.4255425 *
*        6 *    Pierre * 0.0041001 * 0.2973223 * 0.0934888 * 0.0524003 * 0.0518702 *
*        7 *  Brigitte * 0.0157710 * 0.1232678 * 0.0540974 * 0.0817989 * 0.0501849 *
*        8 *   Evelyne * 0.0150734 * 0.2222843 * 0.4479453 * 0.0864396 * 0.0865925 *
************************************************************************************