ADSP PCA

From Bradwiki
Jump to: navigation, search

PCA is a numerical transformation that performs dimensionality reduction, often for data visualization purposes. Like tSNE, PCA helps revealing clusters in high-dimensional data. PCA allows you to visualize two or three components directly against each at the same time.

Other Analyses


Again, this code can be downloaded from the: GENOS GIT




PCA Code




  1 % ######################################################################
  2 %%                PRINCIPAL COMPONENTS ANALYSIS
  3 % ######################################################################
  4 
  5 
  6 clc; close all; clear; rng('shuffle')
  7 cd(fileparts(which('GENOS.m')));
  8 
  9 
 10 MATDATA = 'ADSPdata.mat';
 11 which(MATDATA)
 12 load(MATDATA)
 13 
 14 clearvars -except ADSP
 15 
 16 
 17 
 18 %% CARBON COPY MAIN VARIABLES FROM ADSP.STRUCT
 19 
 20 LOCI = ADSP.LOCI(:,1:17);
 21 CASE = ADSP.CASE;
 22 CTRL = ADSP.CTRL;
 23 PHEN = ADSP.PHEN;
 24 
 25 clearvars -except ADSP LOCI CASE CTRL PHEN
 26 
 27 
 28 
 29 
 30 
 31 %###############################################################
 32 %%       DETERMINE WHICH PARTICIPANTS TO KEEP
 33 %###############################################################
 34 
 35 
 36 
 37 PHE = PHEN(PHEN.TOTvars>14000,:);
 38 
 39 
 40 PHECASE = PHE(PHE.AD==1,:);
 41 PHECTRL = PHE(PHE.AD==0,:);
 42 
 43 
 44 clearvars -except ADSP LOCI CASE CTRL PHEN PHE PHECASE PHECTRL
 45 
 46 
 47 
 48 %###############################################################
 49 %%          COUNT NUMBER OF VARIANTS PER LOCI
 50 %###############################################################
 51 
 52 % The varsum() function will go through each known variant loci
 53 % and check whether anyone's SRR ID from your subset of IDs match
 54 % all known SRR IDs for that loci. It will then sum the total
 55 % number of alleles (+1 for hetzy-alt, +2 for homzy-alt) for each
 56 % loci and return the totals.
 57 
 58 
 59 [CASEN, CTRLN] = varsum(CASE, PHECASE.SRR, CTRL, PHECTRL.SRR);
 60 
 61 
 62 % SAVE COUNTS AS NEW TABLE COLUMNS
 63 LOCI.CASEREFS = numel(PHECASE.SRR)*2-CASEN;
 64 LOCI.CTRLREFS = numel(PHECTRL.SRR)*2-CTRLN;
 65 LOCI.CASEALTS = CASEN;
 66 LOCI.CTRLALTS = CTRLN;
 67 
 68 
 69 clearvars -except ADSP LOCI CASE CTRL PHEN PHE PHECASE PHECTRL
 70 
 71 
 72 
 73 
 74 
 75 
 76 
 77 %###############################################################
 78 %%               COMPUTE FISHER'S P-VALUE
 79 %###############################################################
 80 
 81 
 82 % COMPUTE FISHERS STATISTICS FOR THE TRAINING GROUP
 83 [FISHP, FISHOR] = fishp_mex(LOCI.CASEREFS,LOCI.CASEALTS,...
 84                             LOCI.CTRLREFS,LOCI.CTRLALTS);
 85 
 86 LOCI.FISHPS  = FISHP;
 87 LOCI.FISHORS = FISHOR;
 88 
 89 
 90 clearvars -except ADSP LOCI CASE CTRL PHEN PHE PHECASE PHECTRL
 91 
 92 
 93 
 94 
 95 
 96 %% MAKE LATEST COUNTS THE MAIN TABLE STATS
 97 
 98 LOCI.CASEREF = LOCI.CASEREFS;
 99 LOCI.CTRLREF = LOCI.CTRLREFS;
100 LOCI.CASEALT = LOCI.CASEALTS;
101 LOCI.CTRLALT = LOCI.CTRLALTS;
102 LOCI.FISHP   = LOCI.FISHPS;
103 LOCI.FISHOR  = LOCI.FISHORS;
104 
105 
106 
107 
108 
109 
110 %% SORT VARIANT LOCI TABLE BY FISHER P-VALUE
111 
112 [X,i] = sort(LOCI.FISHP);
113 
114 LOCI  = LOCI(i,:);
115 CASE  = CASE(i);
116 CTRL  = CTRL(i);
117 LOCI.VID = (1:size(LOCI,1))';
118 
119 LOCI.GENE = string(LOCI.GENE);
120 
121 
122 
123 clc; clearvars -except ADSP LOCI CASE CTRL PHEN PHE PHECASE PHECTRL
124 disp(LOCI(1:9,:))
125 
126 
127 
128 
129 
130 %% STORE VARIABLES FOR PCA/TSNE AS 'AMX'
131 
132 AMX         = LOCI;
133 AMXCASE     = CASE;
134 AMXCTRL     = CTRL;
135 
136 
137 clearvars -except ADSP LOCI CASE CTRL PHEN PHE PHECASE PHECTRL...
138 AMX AMXCASE AMXCTRL 
139 
140 
141 
142 
143 
144 %% FILTER VARIANTS BASED ALT > REF
145 
146 PASS = (AMX.CASEREF > AMX.CASEALT./1.5) | (AMX.CTRLREF > AMX.CTRLALT./1.5);
147 sum(~PASS)
148 
149 AMX      = AMX(PASS,:);
150 AMXCASE  = AMXCASE(PASS);
151 AMXCTRL  = AMXCTRL(PASS);
152 AMX.VID  = (1:size(AMX,1))';
153 
154 
155 
156 
157 clearvars -except ADSP LOCI CASE CTRL PHEN PHE PHECASE PHECTRL...
158 AMX AMXCASE AMXCTRL 
159 
160 
161 
162 
163 
164 %% TAKE THE TOP N NUMBER OF VARIANTS
165 N = 50;
166 
167 
168 AMX      = AMX(1:N,:);
169 AMXCASE  = AMXCASE(1:N);
170 AMXCTRL  = AMXCTRL(1:N);
171 AMX.VID  = (1:size(AMX,1))';
172 
173 fprintf('\n %.0f final loci count \n\n',size(AMX,1))
174 
175 clearvars -except ADSP LOCI CASE CTRL PHEN PHE PHECASE PHECTRL...
176 AMX AMXCASE AMXCTRL 
177 
178 
179 
180 
181 
182 
183 
184 
185 %% MAKE  RECTANGLE  NN VARIANT MATRIX
186 
187 
188 [ADNN, caMX, coMX] = varmx(AMX,AMXCASE,AMXCTRL,PHE);
189 
190 clearvars -except ADSP LOCI CASE CTRL PHEN PHE PHECASE PHECTRL...
191 AMX AMXCASE AMXCTRL ADNN 
192 
193 
194 
195 
196 
197 
198 
199 
200 
201 %% RANDOMIZE ADNN AND REORDER PHE TO MATCH ADNN
202 
203 ADL = ADNN(1,:);
204 ADN = ADNN(2:end,:);
205 i = randperm(size(ADN,1));
206 ADN = ADN(i,:);
207 ADNN = [ADL;ADN];
208 
209 
210 [i,j] = ismember(PHE.SRR, ADN(:,1) );
211 PHE.USED = i;
212 PHE.ORDER = j;
213 PHE = PHE(PHE.USED,:);
214 PHE = sortrows(PHE,'ORDER');
215 
216 
217 
218 PCAMX = ADNN(2:end,4:end);
219 
220 
221 clearvars -except ADSP LOCI CASE CTRL PHEN PHE PHECASE PHECTRL...
222 AMX AMXCASE AMXCTRL ADNN PCAMX  
223 
224 
225 
226 %######################################################################
227 %%       PCA : PRINCIPAL COMPONENTS ANALYSIS
228 %######################################################################
229 
230 
231 ss = statset('pca')
232 ss.Display = 'iter';
233 ss.MaxIter = 100;
234 ss.TolFun = 1e4;
235 ss.TolX = 1e4;
236 ss.UseParallel = true;
237 
238 % 'NumComponents',5,  'Algorithm','eig', Elapsed time is 70.459438 seconds.
239 % [PCAC,PCAS,PCAlatent,PCAtsquared,PCAE] = pca(  PCAMX' ,'Options',ss); 
240 
241 
242 % PCAMX(PCAMX>0) = 1;
243 
244 [PCAC,PCAS,~,~,PCAE] = pca(  PCAMX' , 'Options',ss);
245 
246 
247 clearvars -except ADSP LOCI CASE CTRL PHEN PHE PHECASE PHECTRL...
248 AMX AMXCASE AMXCTRL ADNN PCAMX PCAC PCAS PCAE  
249 
250 
251 
252 
253 
254 
255 
256 %% RECONSTRUCT ORIGINAL DATA (OPTIONAL TEST)
257 
258 
259 
260 % MEAN DEVIATE (CENTER) DATA
261 mu = mean(PCAMX);
262 Xi = bsxfun(@minus,PCAMX,mu);
263 
264 
265 % Reconstruct the centered data
266 Xj = PCAS*PCAC';
267 
268 % Reconstruct original data
269 PCAMX_REDUX = round(Xj' + mu);
270 
271 
272 clearvars -except ADSP LOCI CASE CTRL PHEN PHE PHECASE PHECTRL...
273 AMX AMXCASE AMXCTRL ADNN PCAMX PCAC PCAS PCAE PCAMX_REDUX




PCA Plots





ALL PCs in 1D

 1 %% PLOT PCA --- ALL PCs IN 1D -------------------------------------
 2 clc; close all;
 3 fh1=figure('Units','normalized','OuterPosition',[.02 .06 .9 .8],'Color','w');
 4 hax1 = axes('Position',[.08 .08 .40 .80],'Color','none');
 5 hax2 = axes('Position',[.56 .08 .40 .80],'Color','none');
 6 
 7 
 8 Tx = PHE.COHORTNUM;
 9 Tn = unique(Tx);
10 Ts = 'Cohort';
11 
12 % axes(hax1); colormap(hax1, (prism(max(Tn))) );
13 axes(hax1); colormap(hax1, flipud(jet(max(Tn))) );
14 
15 ph1 = scatter( (1:numel(PCAC(:,1))).*0+1  , PCAC(:,1),50, Tx,'o');
16 hold on
17 ph2 = scatter( (1:numel(PCAC(:,2))).*0+2  , PCAC(:,2),50, Tx,'o');
18 hold on
19 ph3 = scatter( (1:numel(PCAC(:,3))).*0+3  , PCAC(:,3),900, Tx,'.');
20 hold on
21 ph4 = scatter( (1:numel(PCAC(:,4))).*0+4  , PCAC(:,4),900, Tx,'.');
22 hold on
23 ph5 = scatter( (1:numel(PCAC(:,5))).*0+5  , PCAC(:,5),900, Tx,'.');
24 
25 
26 hax1.XLim = [0 6];
27 
28 
29 text(1,max(ph1.YData),sprintf(['explains: \n ' num2str(round(PCAE(1),2)) '%% \n']),...
30 'Interpreter','tex','HorizontalAlignment','center','VerticalAlignment','bottom','FontSize',14);
31 
32 text(2,max(ph2.YData),sprintf(['explains: \n ' num2str(round(PCAE(2),2)) '%% \n']),...
33 'Interpreter','tex','HorizontalAlignment','center','VerticalAlignment','bottom','FontSize',14);
34 
35 text(3,max(ph3.YData),sprintf(['explains: \n ' num2str(round(PCAE(3),2)) '%% \n']),...
36 'Interpreter','tex','HorizontalAlignment','center','VerticalAlignment','bottom','FontSize',14);
37 
38 text(4,max(ph4.YData),sprintf(['explains: \n ' num2str(round(PCAE(4),2)) '%% \n']),...
39 'Interpreter','tex','HorizontalAlignment','center','VerticalAlignment','bottom','FontSize',14);
40 
41 text(5,max(ph5.YData),sprintf(['explains: \n ' num2str(round(PCAE(5),2)) '%% \n']),...
42 'Interpreter','tex','HorizontalAlignment','center','VerticalAlignment','bottom','FontSize',14);
43 
44 
45 
46 
47 xlabel('\bf PCA1 - PCA5');ylabel('\bf Eigenvector Coefficients');
48 cb=colorbar('Ticks',unique(Tx),'TickLabels',(unique(Tx)),'Direction','reverse');
49 cb.Label.String = Ts; cb.Label.FontSize = 18; cb.Label.Rotation = -90;
50 cb.Label.VerticalAlignment = 'baseline';
51 
52 
53 
54 
55 
56 axes(hax2); colormap(hax2, (prism(max(Tn))) );
57 
58 ph1 = scatter( (1:numel(PCAC(:,1))).*0+1  , PCAC(:,1),50, Tx,'o','MarkerEdgeAlpha',.02);
59 hold on
60 ph2 = scatter( (1:numel(PCAC(:,2))).*0+2  , PCAC(:,2),50, Tx,'o','MarkerEdgeAlpha',.02);
61 hold on
62 ph3 = scatter( (1:numel(PCAC(:,3))).*0+3  , PCAC(:,3),50, Tx,'o','MarkerEdgeAlpha',.02);
63 hold on
64 ph4 = scatter( (1:numel(PCAC(:,4))).*0+4  , PCAC(:,4),50, Tx,'o','MarkerEdgeAlpha',.02);
65 hold on
66 ph5 = scatter( (1:numel(PCAC(:,5))).*0+5  , PCAC(:,5),50, Tx,'o','MarkerEdgeAlpha',.02);
67 
68 hax2.XLim = [0 6];

Top 50 variants PCA 1D 50vars.png

Top 2000 variants PCA 1D 2000vars.png




ALZHEIMER'S STATUS

 1 %% PLOT PCA --- CASE-vs-CTRL ------------------------------------------
 2 clc; close all;
 3 fh1=figure('Units','normalized','OuterPosition',[.02 .06 .97 .8],'Color','w');
 4 hax1 = axes('Position',[.05 .07 .40 .85],'Color','none');
 5 hax2 = axes('Position',[.52 .07 .40 .85],'Color','none');
 6 
 7 
 8 Tx = PHE.AD;
 9 Tn = unique(Tx);
10 Ts = 'CASE-vs-CTRL';
11 
12 
13 axes(hax1); colormap(hax1, (lines(max(Tn)+1)) );
14 ph1 = scatter(PCAC(:,1), PCAC(:,2),500, Tx,'.');
15 xlabel('\bf PCA1');ylabel('\bf PCA2');
16 
17 cb=colorbar('Ticks',Tn,'TickLabels',Tn,'Direction','reverse');
18 cb.Label.String = Ts; cb.Label.FontSize = 14; cb.Label.Rotation = -90;
19 cb.Label.VerticalAlignment = 'baseline';
20 
21 
22 axes(hax2); colormap(hax2, (lines(max(Tn)+1)) );
23 ph2 = scatter3(PCAC(:,1), PCAC(:,2), PCAC(:,3),500, Tx,'.');
24 xlabel('\bf PCA1');ylabel('\bf PCA2');zlabel('\bf PCA3');
25 view([65,20])

Top 50 variants PCA 2D 3D CASE CTRL 50vars.png

Top 2000 variants PCA 2D 3D CASE CTRL 2000vars.png




STUDY COHORT

 1 %% PLOT PCA --- COHORT ------------------------------------------
 2 clc; close all;
 3 fh1=figure('Units','normalized','OuterPosition',[.02 .06 .97 .8],'Color','w');
 4 hax1 = axes('Position',[.05 .07 .40 .85],'Color','none');
 5 hax2 = axes('Position',[.52 .07 .40 .85],'Color','none');
 6 
 7 
 8 Tx = PHE.COHORTNUM;
 9 Tn = unique(Tx);
10 Ts = 'Cohort';
11 
12 
13 axes(hax1); colormap(hax1, flipud(jet(max(Tn))) );
14 ph1 = scatter(PCAC(:,1), PCAC(:,2),500, Tx,'.');
15 xlabel('\bf PCA1');ylabel('\bf PCA2');
16 
17 cb=colorbar('Ticks',Tn,'TickLabels',Tn,'Direction','reverse');
18 cb.Label.String = Ts; cb.Label.FontSize = 14; cb.Label.Rotation = -90;
19 cb.Label.VerticalAlignment = 'baseline';
20 
21 
22 axes(hax2); colormap(hax2, flipud(jet(max(Tn))) );
23 ph2 = scatter3(PCAC(:,1), PCAC(:,2), PCAC(:,3),500, Tx,'.');
24 xlabel('\bf PCA1');ylabel('\bf PCA2');zlabel('\bf PCA3');
25 view([65,20])

Top 50 variants PCA 2D 3D Cohort 50vars.png

Top 2000 variants PCA 2D 3D Cohort 2000vars.png

AGE

 1 %% PLOT PCA --- AGE ------------------------------------------
 2 clc; close all;
 3 fh1=figure('Units','normalized','OuterPosition',[.02 .06 .97 .8],'Color','w');
 4 hax1 = axes('Position',[.05 .07 .40 .85],'Color','none');
 5 hax2 = axes('Position',[.52 .07 .40 .85],'Color','none');
 6 
 7 AGE = round(PHE.AGE);
 8 [Y,E] = discretize(AGE,8);
 9 for nn = 1:numel(E)
10 AGE(Y==nn) = E(nn);
11 end
12 
13 Tx = AGE(AGE>60);
14 Tn = unique(Tx);
15 Ts = 'AGE-STATUS';
16 
17 PCAage = PCAC(AGE>60,:);
18 
19 
20 axes(hax1); colormap(hax1, (jet(numel(Tn))) );
21 % axes(hax1); colormap(hax1, flipud(jet(max(Tn))) );
22 ph1 = scatter(PCAage(:,1), PCAage(:,2),500, Tx,'.');
23 xlabel('\bf PCA1');ylabel('\bf PCA2');
24 
25 cb=colorbar('Ticks',Tn,'TickLabels',Tn,'Direction','reverse');
26 cb.Label.String = Ts; cb.Label.FontSize = 14; cb.Label.Rotation = -90;
27 cb.Label.VerticalAlignment = 'baseline';
28 
29 
30 axes(hax2); colormap(hax2, (jet(numel(Tn))) );
31 % axes(hax2); colormap(hax2, flipud(jet(max(Tn))) );
32 ph2 = scatter3(PCAage(:,1), PCAage(:,2), PCAage(:,3),500, Tx,'.');
33 xlabel('\bf PCA1');ylabel('\bf PCA2');zlabel('\bf PCA3');
34 view([65,20])

Top 50 variants PCA 2D 3D AGE 50vars.png

Top 2000 variants PCA 2D 3D AGE 2000vars.png



APOE STATUS

 1 %% PLOT PCA --- APOE-STATUS ------------------------------------------
 2 clc; close all;
 3 fh1=figure('Units','normalized','OuterPosition',[.02 .06 .97 .8],'Color','w');
 4 hax1 = axes('Position',[.05 .07 .40 .85],'Color','none');
 5 hax2 = axes('Position',[.52 .07 .40 .85],'Color','none');
 6 
 7 
 8 APOE = unique(PHE.APOE);
 9 PHE.APOEID = PHE.SEX .* 0;
10 for nn = 1:numel(APOE)
11     PHE.APOEID(PHE.APOE == APOE(nn)) = nn;
12 end
13 
14 
15 Tx = PHE.APOEID;
16 Tn = unique(Tx);
17 Ts = 'APOE-STATUS';
18 
19 
20 axes(hax1); colormap(hax1, (lines(numel(Tn))) );
21 % axes(hax1); colormap(hax1, flipud(jet(max(Tn))) );
22 ph1 = scatter(PCAC(:,1), PCAC(:,2),500, Tx,'.');
23 xlabel('\bf PCA1');ylabel('\bf PCA2');
24 
25 cb=colorbar('Ticks',Tn,'TickLabels',APOE,'Direction','reverse');
26 cb.Label.String = Ts; cb.Label.FontSize = 14; cb.Label.Rotation = -90;
27 cb.Label.VerticalAlignment = 'baseline';
28 
29 
30 axes(hax2); colormap(hax2, (lines(numel(Tn))) );
31 % axes(hax2); colormap(hax2, flipud(jet(max(Tn))) );
32 ph2 = scatter3(PCAC(:,1), PCAC(:,2), PCAC(:,3),500, Tx,'.');
33 xlabel('\bf PCA1');ylabel('\bf PCA2');zlabel('\bf PCA3');
34 view([65,20])

Top 50 variants PCA 2D 3D APOE 50vars.png

Top 2000 variants PCA 2D 3D APOE 2000vars.png
























Additional Genomics Analyses




Other Analyses










Notes


Category:ADSP