ADSP t-SNE: Difference between revisions

From bradwiki
Jump to navigation Jump to search
No edit summary
Line 317: Line 317:
axis off
axis off
</syntaxhighlight>
</syntaxhighlight>
[[File: TSNE Case Control.png|600px]]
 
[[File: TSNE Case Control 2kvars.png|600px]]
<big>Top 100 variants</big>
[[File: TSNE Case Control.png|800px]]
 
<big>Top 2000 variants</big>
[[File: TSNE Case Control 2kvars.png|800px]]




Line 340: Line 344:
axis off
axis off
</syntaxhighlight>
</syntaxhighlight>
[[File: TSNE Study Cohort.png|600px]]
 
[[File: TSNE Study Cohort 2kvars.png|600px]]
<big>Top 100 variants</big>
[[File: TSNE Study Cohort.png|800px]]
 
<big>Top 2000 variants</big>
[[File: TSNE Study Cohort 2kvars.png|800px]]




Line 360: Line 368:
axis off
axis off
</syntaxhighlight>
</syntaxhighlight>
[[File: TSNE Sex.png|600px]]
 
[[File: TSNE Sex 2kvars.png|600px]]
<big>Top 100 variants</big>
[[File: TSNE Sex.png|800px]]
 
<big>Top 2000 variants</big>
[[File: TSNE Sex 2kvars.png|800px]]




Line 392: Line 404:
axis off
axis off
</syntaxhighlight>
</syntaxhighlight>
[[File: TSNE Age.png|600px]]
 
[[File: TSNE Age 2kvars.png|600px]]
<big>Top 100 variants</big>
[[File: TSNE Age.png|800px]]
 
<big>Top 2000 variants</big>
[[File: TSNE Age 2kvars.png|800px]]




Line 422: Line 438:
axis off
axis off
</syntaxhighlight>
</syntaxhighlight>
[[File: TSNE APOE.png|600px]]
 
[[File: TSNE APOE 2kvars.png|600px]]
<big>Top 100 variants</big>
[[File: TSNE APOE.png|800px]]
 
<big>Top 2000 variants</big>
[[File: TSNE APOE 2kvars.png|800px]]




Line 444: Line 464:
axis off
axis off
</syntaxhighlight>
</syntaxhighlight>
[[File: TSNE Consent 2kvars.png|600px]]
 
<big>Top 2000 variants</big>
[[File: TSNE Consent 2kvars.png|800px]]




Line 474: Line 496:


<br> <br> <br> <br> <br> <br> <br>
<br> <br> <br> <br> <br> <br> <br>
==Additional Genomics Analyses==
==Additional Genomics Analyses==
----
----

Revision as of 04:54, 6 February 2018

The Alzheimer's Disease Sequencing Project (ADSP) has made whole-exome sequencing (WES) data available in VCF format. The following is a step-by-step analysis walkthrough of this dataset. The ultimate goal is to develop a platform for Alzheimer's Disease diagnosis based on genome sequencing information. The data for this analysis can be downloaded here:

ADSP_WES_VCF_LATEST_RELEASE.mat.

Both the dataset and the following analysis code are in MATLAB format. Note, however, that there is an R package for reading .mat files. All MATLAB code and custom functions used in the demo walkthrough below can be downloaded here:

GENOS GITHUB CODE REPO
Other Analyses










tSNE : t-Distributed Stochastic Neighbor Embedding




% ######################################################################
%%       tSNE : t-Distributed Stochastic Neighbor Embedding
% ######################################################################
% Start from a clean session: clear the console, close figures, wipe the
% workspace, and reseed the random number generator.
clc
close all
clear
rng('shuffle')

% Work from the folder that contains GENOS.m.
genosDir = fileparts(which('GENOS.m'));
cd(genosDir);


% Report which copy of the data file is on the path, then load it.
MATDATA = 'ADSPdata.mat';
which(MATDATA)
load(MATDATA)

% Only the ADSP variable is needed from here on.
clearvars('-except','ADSP')



%% CARBON COPY MAIN VARIABLES FROM ADSP.STRUCT

% Working copies, so the ADSP struct itself is left untouched.
% Only the first 17 columns of ADSP.LOCI are carried forward.
PHEN = ADSP.PHEN;
CTRL = ADSP.CTRL;
CASE = ADSP.CASE;
LOCI = ADSP.LOCI(:,1:17);

clearvars('-except','ADSP','LOCI','CASE','CTRL','PHEN')





%###############################################################
%%       DETERMINE WHICH PARTICIPANTS TO KEEP
%###############################################################

% Keep only participants whose TOTvars value exceeds 14000.
enoughVars = PHEN.TOTvars > 14000;
PHE = PHEN(enoughVars,:);

% Split the retained participants on the AD flag (1 = case, 0 = control).
isCase  = (PHE.AD == 1);
isCtrl  = (PHE.AD == 0);
PHECASE = PHE(isCase,:);
PHECTRL = PHE(isCtrl,:);


clearvars('-except','ADSP','LOCI','CASE','CTRL','PHEN', ...
          'PHE','PHECASE','PHECTRL')









%###############################################################
%%          COUNT NUMBER OF VARIANTS PER LOCUS
%###############################################################

% Per the original author's description, varsum() walks every known
% variant locus, matches the SRR IDs in the supplied subset against the
% SRR IDs recorded for that locus, and returns the summed alternate
% allele count per locus (+1 for a heterozygous alt, +2 for a
% homozygous alt).

[CASEN, CTRLN] = varsum(CASE, PHECASE.SRR, CTRL, PHECTRL.SRR);


% SAVE COUNTS AS NEW TABLE COLUMNS
% Reference counts are (2 alleles x number of participants) minus the
% alternate count, i.e. every participant is treated as contributing two
% alleles at every locus.
nCaseAlleles = 2 * numel(PHECASE.SRR);
nCtrlAlleles = 2 * numel(PHECTRL.SRR);

LOCI.CASEREFS = nCaseAlleles - CASEN;
LOCI.CTRLREFS = nCtrlAlleles - CTRLN;
LOCI.CASEALTS = CASEN;
LOCI.CTRLALTS = CTRLN;


clearvars('-except','ADSP','LOCI','CASE','CTRL','PHEN', ...
          'PHE','PHECASE','PHECTRL')







%###############################################################
%%               COMPUTE FISHER'S P-VALUE
%###############################################################

% Feed the per-locus 2x2 counts (case/ctrl x ref/alt) to fishp_mex and
% store the two vectors it returns as new LOCI columns.
caseRef = LOCI.CASEREFS;
caseAlt = LOCI.CASEALTS;
ctrlRef = LOCI.CTRLREFS;
ctrlAlt = LOCI.CTRLALTS;

[FISHP, FISHOR] = fishp_mex(caseRef, caseAlt, ctrlRef, ctrlAlt);

LOCI.FISHPS  = FISHP;
LOCI.FISHORS = FISHOR;


clearvars('-except','ADSP','LOCI','CASE','CTRL','PHEN', ...
          'PHE','PHECASE','PHECTRL')





%% MAKE LATEST COUNTS THE MAIN TABLE STATS

% Copy each freshly computed column (trailing "S") onto the main column
% of the same name without the "S".
srcCols = {'CASEREFS','CTRLREFS','CASEALTS','CTRLALTS','FISHPS','FISHORS'};
dstCols = {'CASEREF' ,'CTRLREF' ,'CASEALT' ,'CTRLALT' ,'FISHP' ,'FISHOR' };

for k = 1:numel(srcCols)
    LOCI.(dstCols{k}) = LOCI.(srcCols{k});
end






%% SORT VARIANT LOCI TABLE BY FISHER P-VALUE

% Sort ascending by p-value. Only the permutation is needed, so the
% sorted values themselves are discarded.
[~,i] = sort(LOCI.FISHP);

% Apply the same permutation to the table and to both per-locus arrays
% so they stay row-aligned, then renumber the variant IDs.
LOCI  = LOCI(i,:);
CASE  = CASE(i);
CTRL  = CTRL(i);
LOCI.VID = (1:size(LOCI,1))';

LOCI.GENE = string(LOCI.GENE);



clc; clearvars -except ADSP LOCI CASE CTRL PHEN PHE PHECASE PHECTRL

% Preview the top rows; guard against tables with fewer than 9 loci so
% the preview cannot raise an index-out-of-range error.
disp(LOCI(1:min(9,size(LOCI,1)),:))





%% STORE VARIABLES FOR PCA/TSNE AS 'AMX'

% Working copies for the PCA/t-SNE steps; LOCI, CASE and CTRL themselves
% are not modified after this point in the script.
AMXCTRL = CTRL;
AMXCASE = CASE;
AMX     = LOCI;


clearvars('-except','ADSP','LOCI','CASE','CTRL','PHEN', ...
          'PHE','PHECASE','PHECTRL', ...
          'AMX','AMXCASE','AMXCTRL')





%% FILTER VARIANTS WHERE ALT GREATLY OUTNUMBERS REF

% A variant passes when, in the case group OR the control group, the
% reference count exceeds the alternate count divided by 1.5.
% Variants failing in both groups are dropped.
caseOK = AMX.CASEREF > AMX.CASEALT./1.5;
ctrlOK = AMX.CTRLREF > AMX.CTRLALT./1.5;
PASS   = caseOK | ctrlOK;

% Echo the number of variants being removed.
sum(~PASS)

% Apply the mask to the table and both per-locus arrays, then renumber.
AMXCTRL  = AMXCTRL(PASS);
AMXCASE  = AMXCASE(PASS);
AMX      = AMX(PASS,:);
AMX.VID  = (1:size(AMX,1))';




clearvars('-except','ADSP','LOCI','CASE','CTRL','PHEN', ...
          'PHE','PHECASE','PHECTRL', ...
          'AMX','AMXCASE','AMXCTRL')





%% TAKE THE TOP N NUMBER OF VARIANTS

% Requested number of top-ranked variants (AMX is already sorted by
% Fisher p-value). Set to 2000 to reproduce the "Top 2000 variants" plots.
N = 100;

% Never request more rows than survived the previous filter; without
% this clamp AMX(1:N,:) errors when fewer than N variants remain.
N = min(N, size(AMX,1));


AMX      = AMX(1:N,:);
AMXCASE  = AMXCASE(1:N);
AMXCTRL  = AMXCTRL(1:N);
AMX.VID  = (1:size(AMX,1))';

fprintf('\n %.0f final loci count \n\n',size(AMX,1))

clearvars -except ADSP LOCI CASE CTRL PHEN PHE PHECASE PHECTRL...
AMX AMXCASE AMXCTRL 








%% MAKE  RECTANGLE  NN VARIANT MATRIX

% Build the rectangular variant matrix with varmx(). Three outputs are
% still requested, but only the first (ADNN) is used downstream, so the
% other two are discarded.
[ADNN, ~, ~] = varmx(AMX,AMXCASE,AMXCTRL,PHE);

clearvars('-except','ADSP','LOCI','CASE','CTRL','PHEN', ...
          'PHE','PHECASE','PHECTRL', ...
          'AMX','AMXCASE','AMXCTRL','ADNN')









%% RANDOMIZE ADNN AND REORDER PHE TO MATCH ADNN

% Row 1 of ADNN stays in place; every other row is shuffled.
topRow   = ADNN(1,:);
bodyRows = ADNN(2:end,:);
shuffled = bodyRows(randperm(size(bodyRows,1)),:);
ADNN     = [topRow; shuffled];


% Match each PHE.SRR against column 1 of the shuffled rows. USED flags
% participants that appear there and ORDER is their row position, so
% sorting on ORDER puts PHE in the same row order as the matrix.
[isUsed, rowPos] = ismember(PHE.SRR, shuffled(:,1));
PHE.USED  = isUsed;
PHE.ORDER = rowPos;
PHE = sortrows(PHE(PHE.USED,:),'ORDER');



% Input matrix for PCA/t-SNE: drop row 1 and the first three columns
% (presumably ID/label columns -- confirm against varmx).
PCAMX = ADNN(2:end,4:end);


clearvars('-except','ADSP','LOCI','CASE','CTRL','PHEN', ...
          'PHE','PHECASE','PHECTRL', ...
          'AMX','AMXCASE','AMXCTRL','ADNN','PCAMX')




%% (OPTIONAL) PRE-PERFORM PCA BEFORE TSNE 

% Disabled alternative: run PCA explicitly, then pass the first 10
% components to tsne with its built-in PCA step switched off.
% NOTE(review): TolFun/TolX of 1e4 look like they were meant to be 1e-4;
% confirm before enabling. Per the pca() documentation the 'Options'
% struct only affects the 'als' algorithm -- verify it has any effect here.
%
% ss = statset('pca');
% ss.Display = 'iter';
% ss.MaxIter = 100;
% ss.TolFun = 1e4;
% ss.TolX = 1e4;
% ss.UseParallel = true;
% 
% [PCAC,PCAS,~,~,~] = pca(  PCAMX' , 'Options',ss);
% clc; close all; scatter(PCAC(:,1),PCAC(:,2))
%
% % ...,'NumPCAComponents',0,...  means don't use PCA
% tSN = tsne(PCAC(:,1:10),'NumDimensions',2,'Theta',.6,'NumPCAComponents',0);
% 
% clearvars -except ADSP GENB LOCI CASE CTRL PHEN AMX AMXCASE AMXCTRL...
% PHE ADNN PCAMX tSN PCAC PCAS






%######################################################################
%%       tSNE : t-Distributed Stochastic Neighbor Embedding
%######################################################################

% Embed the rows of PCAMX in 2-D. tsne first reduces the input to 8
% principal components ('NumPCAComponents'), and 'Theta' is the
% Barnes-Hut speed/accuracy trade-off.
tSN = tsne(PCAMX, ...
           'NumDimensions',    2, ...
           'Theta',            .6, ...
           'NumPCAComponents', 8);


disp('done')
clearvars('-except','ADSP','GENB','LOCI','CASE','CTRL','PHEN', ...
          'AMX','AMXCASE','AMXCTRL', ...
          'PHE','ADNN','PCAMX','tSN','PCAC','PCAS')




t-SNE Plots






ALZHEIMER'S STATUS

%% PLOT TSNE --- ALZHEIMER'S STATUS (CASE/CTRL) --------------------------
% Scatter of the 2-D embedding, coloured by the PHE.AD flag.
close all
fh1 = figure('Color','w', 'Units','normalized', 'Position',[.05 .05 .70 .84]);
ax1 = axes('Color','none', 'Position',[.05 .02 .9 .9]);

ph1 = gscatter(tSN(:,1), tSN(:,2), PHE.AD, [], '.', 15);

title({'\fontsize{16} t-SNE : CASE vs CTRL',' '})

% Legend order follows the group order returned by gscatter; with AD
% coded 0/1 that is CTRL first, then CASE.
legend(ph1, {'CTRL','CASE'}, 'Location','NorthWest', 'Box','off', 'FontSize',12);
axis('off')

Top 100 variants Error creating thumbnail: File missing

Top 2000 variants Error creating thumbnail: File missing




STUDY COHORT

%% PLOT TSNE --- CONSORTIUM STUDY COHORT (1:24) -------------------------
% Scatter of the 2-D embedding, coloured by PHE.COHORT.
close all
fh1 = figure('Color','w', 'Units','normalized', 'Position',[.05 .05 .70 .84]);
ax1 = axes('Color','none', 'Position',[.05 .02 .9 .9]);

ph1 = gscatter(tSN(:,1), tSN(:,2), PHE.COHORT, [], '.', 15);


title({'\fontsize{16} t-SNE : STUDY COHORT',' '})
% gscatter's automatic legend (one entry per cohort value) is kept as is.
% legend(ph1,{'CTRL','CASE'},'FontSize',12,'Box','off','Location','NorthWest');
axis('off')

Top 100 variants Error creating thumbnail: File missing

Top 2000 variants Error creating thumbnail: File missing



SEX

%% PLOT TSNE --- SEX (M/F) ----------------------------------------------
% Scatter of the 2-D embedding, coloured by PHE.SEX.
close all
fh1 = figure('Color','w', 'Units','normalized', 'Position',[.05 .05 .70 .84]);
ax1 = axes('Color','none', 'Position',[.05 .02 .9 .9]);

ph1 = gscatter(tSN(:,1), tSN(:,2), PHE.SEX, [], '.', 15);


title({'\fontsize{16} t-SNE : SEX',' '})

% NOTE(review): labels assume the first PHE.SEX group is male and the
% second female -- confirm against the PHEN coding.
legend(ph1, {'Male','Female'}, 'Location','NorthWest', 'Box','off', 'FontSize',12);
axis('off')

Top 100 variants Error creating thumbnail: File missing

Top 2000 variants Error creating thumbnail: File missing



AGE

%% PLOT TSNE --- AGE (BINNED AGE) ---------------------------------------
% Scatter of the 2-D embedding for participants older than 60, coloured
% by age bin. Each bin is labelled with its lower edge (60, 80, 90).
close all; 
fh1=figure('Units','normalized','Position',[.05 .05 .70 .84],'Color','w');
ax1=axes('Position',[.05 .02 .9 .9],'Color','none');

AGE = round(PHE.AGE);
ofAGE = AGE>60;
A = AGE(ofAGE);

% Optional diagnostic -- run on its own, not inside this figure, since it
% would draw into the axes that gscatter is about to use:
% figure; histogram(AGE)

[Y,E] = discretize(A,[60 80 90 91]);
% [Y,E] = discretize(A,[60 75 85 90 91]);

% Replace each age with the lower edge of its bin. There is one fewer bin
% than edges, so iterate over bins rather than edges. Ages outside the
% edge range get Y = NaN and keep their rounded value.
for nn = 1:numel(E)-1
    A(Y==nn) = E(nn);
end

ph1 = gscatter(tSN(ofAGE,1),tSN(ofAGE,2),  A, [],'.',15);


title({'\fontsize{16} t-SNE : AGE',' '})
% legend(ph1,{'CTRL','CASE'},'FontSize',12,'Box','off','Location','NorthWest');
axis off

Top 100 variants Error creating thumbnail: File missing

Top 2000 variants Error creating thumbnail: File missing



APOE STATUS

%% PLOT TSNE --- APOE STATUS (22,23,24,33,34,44) ------------------------
% Scatter of the 2-D embedding, coloured by PHE.APOE.
close all
fh1 = figure('Color','w', 'Units','normalized', 'Position',[.05 .05 .70 .84]);
ax1 = axes('Color','none', 'Position',[.05 .02 .9 .9]);


ph1 = gscatter(tSN(:,1), tSN(:,2), PHE.APOE, [], '.', 15);


% Per-group styling. Indexing up to ph1(6) requires at least six groups
% in PHE.APOE; which genotype each index corresponds to depends on the
% group order gscatter returns.
set(ph1(1), 'MarkerSize',35);
set(ph1(2), 'MarkerSize',25, 'Color',[.20 .20 .99]);
set(ph1(3), 'MarkerSize',35);
set(ph1(4), 'Color',[.99 .50 .10]);
set(ph1(5), 'Color',[.30 .70 .80]);
set(ph1(6), 'MarkerSize',25);

title({'\fontsize{16} t-SNE : APOE',' '})
% legend(ph1,{'CTRL','CASE'},'FontSize',12,'Box','off','Location','NorthWest');
axis('off')

Top 100 variants Error creating thumbnail: File missing

Top 2000 variants Error creating thumbnail: File missing



CONSENT GROUP

%% PLOT TSNE --- CONSENT GROUP ------------------------------------------
% Scatter of the 2-D embedding, coloured by PHE.RD.
close all
fh1 = figure('Color','w', 'Units','normalized', 'Position',[.05 .05 .70 .84]);
ax1 = axes('Color','none', 'Position',[.05 .02 .9 .9]);


ph1 = gscatter(tSN(:,1), tSN(:,2), PHE.RD, [], '.', 15);


title({'\fontsize{16} t-SNE : CONSENT GROUP',' '})
% gscatter's automatic legend (one entry per PHE.RD value) is kept as is.
% legend(ph1,{'CTRL','CASE'},'FontSize',12,'Box','off','Location','NorthWest');
axis('off')

Top 2000 variants Error creating thumbnail: File missing






















Additional Genomics Analyses




Other Analyses










Notes


Category:ADSP