ADSP t-SNE

From Bradwiki
Revision as of 03:50, 6 February 2018 by Bradley Monk (talk | contribs)
Jump to navigation Jump to search

The Alzheimer's Disease Sequencing Project (ADSP) has made available WES data in vcf format. The following is a step-by-step analysis walkthrough of this dataset. Here the ultimate goal is to develop a platform for Alzheimer's Disease diagnosis based on genome sequencing information. The data for this analysis can be downloaded here:

ADSP_WES_VCF_LATEST_RELEASE.mat.

Both the dataset and the following analysis code are in MATLAB format. Note however there is an R package for reading .mat files. All MATLAB code and custom functions used in the demo walkthrough below can be downloaded here:

GENOS GITHUB CODE REPO
Other Analyses










tSNE : t-Distributed Stochastic Neighbor Embedding




  1 % ######################################################################
  2 %%       tSNE : t-Distributed Stochastic Neighbor Embedding
  3 % ######################################################################
  4 clc; close all; clear; rng('shuffle')
  5 cd(fileparts(which('GENOS.m')));
  6 
  7 
  8 MATDATA = 'ADSPdata.mat';
  9 which(MATDATA)
 10 load(MATDATA)
 11 
 12 clearvars -except ADSP
 13 
 14 
 15 
 16 %% CARBON COPY MAIN VARIABLES FROM ADSP.STRUCT
 17 
 18 LOCI = ADSP.LOCI(:,1:17);
 19 CASE = ADSP.CASE;
 20 CTRL = ADSP.CTRL;
 21 PHEN = ADSP.PHEN;
 22 
 23 clearvars -except ADSP LOCI CASE CTRL PHEN
 24 
 25 
 26 
 27 
 28 
 29 %###############################################################
 30 %%       DETERMINE WHICH PARTICIPANTS TO KEEP
 31 %###############################################################
 32 
 33 
 34 
 35 PHE = PHEN(PHEN.TOTvars>14000,:);
 36 
 37 
 38 PHECASE = PHE(PHE.AD==1,:);
 39 PHECTRL = PHE(PHE.AD==0,:);
 40 
 41 
 42 clearvars -except ADSP LOCI CASE CTRL PHEN PHE PHECASE PHECTRL
 43 
 44 
 45 
 46 
 47 
 48 
 49 
 50 
 51 
 52 %###############################################################
 53 %%          COUNT NUMBER OF VARIANTS PER LOCI
 54 %###############################################################
 55 
 56 % The varsum() function will go through each known variant loci
 57 % and check whether anyone's SRR ID from your subset of IDs match
 58 % all known SRR IDs for that loci. It will then sum the total
 59 % number of alleles (+1 for hetzy-alt, +2 for homzy-alt) for each
 60 % loci and return the totals.
 61 
 62 
 63 [CASEN, CTRLN] = varsum(CASE, PHECASE.SRR, CTRL, PHECTRL.SRR);
 64 
 65 
 66 % SAVE COUNTS AS NEW TABLE COLUMNS
 67 LOCI.CASEREFS = numel(PHECASE.SRR)*2-CASEN;
 68 LOCI.CTRLREFS = numel(PHECTRL.SRR)*2-CTRLN;
 69 LOCI.CASEALTS = CASEN;
 70 LOCI.CTRLALTS = CTRLN;
 71 
 72 
 73 clearvars -except ADSP LOCI CASE CTRL PHEN PHE PHECASE PHECTRL
 74 
 75 
 76 
 77 
 78 
 79 
 80 
 81 %###############################################################
 82 %%               COMPUTE FISHER'S P-VALUE
 83 %###############################################################
 84 
 85 
 86 % COMPUTE FISHERS STATISTICS FOR THE TRAINING GROUP
 87 [FISHP, FISHOR] = fishp_mex(LOCI.CASEREFS,LOCI.CASEALTS,...
 88                             LOCI.CTRLREFS,LOCI.CTRLALTS);
 89 
 90 LOCI.FISHPS  = FISHP;
 91 LOCI.FISHORS = FISHOR;
 92 
 93 
 94 clearvars -except ADSP LOCI CASE CTRL PHEN PHE PHECASE PHECTRL
 95 
 96 
 97 
 98 
 99 
100 %% MAKE LATEST COUNTS THE MAIN TABLE STATS
101 
102 LOCI.CASEREF = LOCI.CASEREFS;
103 LOCI.CTRLREF = LOCI.CTRLREFS;
104 LOCI.CASEALT = LOCI.CASEALTS;
105 LOCI.CTRLALT = LOCI.CTRLALTS;
106 LOCI.FISHP   = LOCI.FISHPS;
107 LOCI.FISHOR  = LOCI.FISHORS;
108 
109 
110 
111 
112 
113 
114 %% SORT VARIANT LOCI TABLE BY FISHER P-VALUE
115 
116 [X,i] = sort(LOCI.FISHP);
117 
118 LOCI  = LOCI(i,:);
119 CASE  = CASE(i);
120 CTRL  = CTRL(i);
121 LOCI.VID = (1:size(LOCI,1))';
122 
123 LOCI.GENE = string(LOCI.GENE);
124 
125 
126 
127 clc; clearvars -except ADSP LOCI CASE CTRL PHEN PHE PHECASE PHECTRL
128 disp(LOCI(1:9,:))
129 
130 
131 
132 
133 
134 %% STORE VARIABLES FOR PCA/TSNE AS 'AMX'
135 
136 AMX         = LOCI;
137 AMXCASE     = CASE;
138 AMXCTRL     = CTRL;
139 
140 
141 clearvars -except ADSP LOCI CASE CTRL PHEN PHE PHECASE PHECTRL...
142 AMX AMXCASE AMXCTRL 
143 
144 
145 
146 
147 
148 %% FILTER VARIANTS BASED ALT > REF
149 
150 PASS = (AMX.CASEREF > AMX.CASEALT./1.5) | (AMX.CTRLREF > AMX.CTRLALT./1.5);
151 sum(~PASS)
152 
153 AMX      = AMX(PASS,:);
154 AMXCASE  = AMXCASE(PASS);
155 AMXCTRL  = AMXCTRL(PASS);
156 AMX.VID  = (1:size(AMX,1))';
157 
158 
159 
160 
161 clearvars -except ADSP LOCI CASE CTRL PHEN PHE PHECASE PHECTRL...
162 AMX AMXCASE AMXCTRL 
163 
164 
165 
166 
167 
168 %% TAKE THE TOP N NUMBER OF VARIANTS
169 N = 100;
170 
171 
172 AMX      = AMX(1:N,:);
173 AMXCASE  = AMXCASE(1:N);
174 AMXCTRL  = AMXCTRL(1:N);
175 AMX.VID  = (1:size(AMX,1))';
176 
177 fprintf('\n %.0f final loci count \n\n',size(AMX,1))
178 
179 clearvars -except ADSP LOCI CASE CTRL PHEN PHE PHECASE PHECTRL...
180 AMX AMXCASE AMXCTRL 
181 
182 
183 
184 
185 
186 
187 
188 
189 %% MAKE  RECTANGLE  NN VARIANT MATRIX
190 
191 
192 [ADNN, caMX, coMX] = varmx(AMX,AMXCASE,AMXCTRL,PHE);
193 
194 clearvars -except ADSP LOCI CASE CTRL PHEN PHE PHECASE PHECTRL...
195 AMX AMXCASE AMXCTRL ADNN 
196 
197 
198 
199 
200 
201 
202 
203 
204 
205 %% RANDOMIZE ADNN AND REORDER PHE TO MATCH ADNN
206 
207 ADL = ADNN(1,:);
208 ADN = ADNN(2:end,:);
209 i = randperm(size(ADN,1));
210 ADN = ADN(i,:);
211 ADNN = [ADL;ADN];
212 
213 
214 [i,j] = ismember(PHE.SRR, ADN(:,1) );
215 PHE.USED = i;
216 PHE.ORDER = j;
217 PHE = PHE(PHE.USED,:);
218 PHE = sortrows(PHE,'ORDER');
219 
220 
221 
222 PCAMX = ADNN(2:end,4:end);
223 
224 
225 clearvars -except ADSP LOCI CASE CTRL PHEN PHE PHECASE PHECTRL...
226 AMX AMXCASE AMXCTRL ADNN PCAMX  
227 
228 
229 
230 
231 %% (OPTIONAL) PRE-PERFORM PCA BEFORE TSNE 
232 
233 % ss = statset('pca');
234 % ss.Display = 'iter';
235 % ss.MaxIter = 100;
236 % ss.TolFun = 1e4;
237 % ss.TolX = 1e4;
238 % ss.UseParallel = true;
239 % 
240 % [PCAC,PCAS,~,~,~] = pca(  PCAMX' , 'Options',ss);
241 % clc; close all; scatter(PCAC(:,1),PCAC(:,2))
242 %
243 % % ...,'NumPCAComponents',0,...  means don't use PCA
244 % tSN = tsne(PCAC(:,1:10),'NumDimensions',2,'Theta',.6,'NumPCAComponents',0);
245 % 
246 % clearvars -except ADSP GENB LOCI CASE CTRL PHEN AMX AMXCASE AMXCTRL...
247 % PHE ADNN PCAMX tSN PCAC PCAS
248 
249 
250 
251 
252 
253 
254 %######################################################################
255 %%       tSNE : t-Distributed Stochastic Neighbor Embedding
256 %######################################################################
257 
258 
259 
260 tSN = tsne(PCAMX,'NumDimensions',2,'Theta',.6,'NumPCAComponents',8);
261 
262 
263 disp('done')
264 clearvars -except ADSP GENB LOCI CASE CTRL PHEN AMX AMXCASE AMXCTRL...
265 PHE ADNN PCAMX tSN PCAC PCAS




t-SNE Plots






ALZHEIMER'S STATUS

 1 %% PLOT TSNE --- ALZHEIMER'S STATUS (CASE/CTRL) --------------------------
 2 close all; 
 3 fh1=figure('Units','normalized','Position',[.05 .05 .70 .84],'Color','w');
 4 ax1=axes('Position',[.05 .02 .9 .9],'Color','none');
 5 
 6 ph1 = gscatter(tSN(:,1),tSN(:,2),  PHE.AD, [],'.',15);
 7 
 8 title({'\fontsize{16} t-SNE : CASE vs CTRL',' '})
 9 legend(ph1,{'CTRL','CASE'},'FontSize',12,'Box','off','Location','NorthWest');
10 axis off

TSNE Case Control.png TSNE Case Control 2kvars.png




STUDY COHORT

 1 %% PLOT TSNE --- CONSORTIUM STUDY COHORT (1:24) -------------------------
 2 close all; 
 3 fh1=figure('Units','normalized','Position',[.05 .05 .70 .84],'Color','w');
 4 ax1=axes('Position',[.05 .02 .9 .9],'Color','none');
 5 
 6 ph1 = gscatter(tSN(:,1),tSN(:,2),  PHE.COHORT, [],'.',15);
 7 
 8 
 9 title({'\fontsize{16} t-SNE : STUDY COHORT',' '})
10 % legend(ph1,{'CTRL','CASE'},'FontSize',12,'Box','off','Location','NorthWest');
11 axis off

TSNE Study Cohort.png TSNE Study Cohort 2kvars.png



SEX

 1 %% PLOT TSNE --- SEX (M/F) ----------------------------------------------
 2 close all; 
 3 fh1=figure('Units','normalized','Position',[.05 .05 .70 .84],'Color','w');
 4 ax1=axes('Position',[.05 .02 .9 .9],'Color','none');
 5 
 6 ph1 = gscatter(tSN(:,1),tSN(:,2),  PHE.SEX, [],'.',15);
 7 
 8 
 9 title({'\fontsize{16} t-SNE : SEX',' '})
10 legend(ph1,{'Male','Female'},'FontSize',12,'Box','off','Location','NorthWest');
11 axis off

TSNE Sex.png TSNE Sex 2kvars.png



AGE

 1 %% PLOT TSNE --- AGE (BINNED AGE) ---------------------------------------
 2 close all; 
 3 fh1=figure('Units','normalized','Position',[.05 .05 .70 .84],'Color','w');
 4 ax1=axes('Position',[.05 .02 .9 .9],'Color','none');
 5 
 6 AGE = round(PHE.AGE);
 7 ofAGE = AGE>60;
 8 A = AGE(ofAGE);
 9 
10 histogram(AGE)
11 
12 [Y,E] = discretize(A,[60 80 90 91]);
13 % [Y,E] = discretize(A,[60 75 85 90 91]);
14 for nn = 1:numel(E)
15 A(Y==nn) = E(nn);
16 end
17 
18 ph1 = gscatter(tSN(ofAGE,1),tSN(ofAGE,2),  A, [],'.',15);
19 
20 
21 title({'\fontsize{16} t-SNE : AGE',' '})
22 % legend(ph1,{'CTRL','CASE'},'FontSize',12,'Box','off','Location','NorthWest');
23 axis off

TSNE Age.png TSNE Age 2kvars.png



APOE STATUS

 1 %% PLOT TSNE --- APOE STATUS (22,23,24,33,34,44) ------------------------
 2 close all; 
 3 fh1=figure('Units','normalized','Position',[.05 .05 .70 .84],'Color','w');
 4 ax1=axes('Position',[.05 .02 .9 .9],'Color','none');
 5 
 6 
 7 ph1 = gscatter(tSN(:,1),tSN(:,2),  PHE.APOE, [],'.',15);
 8 
 9 
10 ph1(1).MarkerSize = 35;
11 ph1(2).MarkerSize = 25;
12 ph1(2).Color = [.20 .20 .99];
13 ph1(3).MarkerSize = 35;
14 ph1(4).Color = [.99 .50 .10];
15 ph1(5).Color = [.30 .70 .80];
16 ph1(6).MarkerSize = 25;
17 
18 title({'\fontsize{16} t-SNE : APOE',' '})
19 % legend(ph1,{'CTRL','CASE'},'FontSize',12,'Box','off','Location','NorthWest');
20 axis off

TSNE APOE.png TSNE APOE 2kvars.png



CONSENT GROUP

 1 %% PLOT TSNE --- CONSENT GROUP ------------------------------------------
 2 close all; 
 3 fh1=figure('Units','normalized','Position',[.05 .05 .70 .84],'Color','w');
 4 ax1=axes('Position',[.05 .02 .9 .9],'Color','none');
 5 
 6 
 7 ph1 = gscatter(tSN(:,1),tSN(:,2),  PHE.RD, [],'.',15);
 8 
 9 
10 title({'\fontsize{16} t-SNE : CONSENT GROUP',' '})
11 % legend(ph1,{'CTRL','CASE'},'FontSize',12,'Box','off','Location','NorthWest');
12 axis off

TSNE Consent 2kvars.png






















Additional Genomics Analyses




Other Analyses










Notes


Category:ADSP