ADSP t-SNE
The Alzheimer's Disease Sequencing Project (ADSP) has made whole-exome sequencing (WES) data available in VCF format. The following is a step-by-step walkthrough of an analysis of this dataset. The ultimate goal is to develop a platform for Alzheimer's disease diagnosis based on genome sequencing information. The data for this analysis can be downloaded here:
Both the dataset and the analysis code below are in MATLAB format. Note, however, that there is an R package for reading .mat files. All MATLAB code and custom functions used in the walkthrough below can be downloaded here:
tSNE : t-Distributed Stochastic Neighbor Embedding
% ######################################################################
%% tSNE : t-Distributed Stochastic Neighbor Embedding
% ######################################################################
% Start from a clean session: clear the command window, close all figures,
% wipe the workspace, and reseed the random number generator.
clc
close all
clear
rng('shuffle')
% Switch to the folder that holds GENOS.m so relative paths resolve there.
projectDir = fileparts(which('GENOS.m'));
cd(projectDir);
% Print where the dataset resolves on the MATLAB path, load it, and discard
% everything except the ADSP variable.
MATDATA = 'ADSPdata.mat';
which(MATDATA)
load(MATDATA)
clearvars -except ADSP
%% CARBON COPY MAIN VARIABLES FROM ADSP.STRUCT
% Later sections modify these working copies, not the fields of ADSP itself.
% Only the first 17 columns of ADSP.LOCI are carried over.
PHEN = ADSP.PHEN;
CTRL = ADSP.CTRL;
CASE = ADSP.CASE;
LOCI = ADSP.LOCI(:,1:17);
clearvars -except ADSP LOCI CASE CTRL PHEN
%###############################################################
%% DETERMINE WHICH PARTICIPANTS TO KEEP
%###############################################################
% Keep participants whose TOTvars exceeds 14000, then split the retained
% rows into cases (AD == 1) and controls (AD == 0).
hasEnoughVars = PHEN.TOTvars > 14000;
PHE = PHEN(hasEnoughVars,:);
isCase = (PHE.AD == 1);
isCtrl = (PHE.AD == 0);
PHECASE = PHE(isCase,:);
PHECTRL = PHE(isCtrl,:);
clearvars -except ADSP LOCI CASE CTRL PHEN PHE PHECASE PHECTRL
%###############################################################
%% COUNT NUMBER OF VARIANTS PER LOCUS
%###############################################################
% varsum() is a project helper that is not shown here. As described by the
% original author: for every known variant locus it checks which SRR IDs
% from the supplied subset appear among the SRR IDs recorded for that
% locus, and returns the summed alternate-allele count per locus
% (+1 for heterozygous-alt, +2 for homozygous-alt).
[CASEN, CTRLN] = varsum(CASE, PHECASE.SRR, CTRL, PHECTRL.SRR);
% Each participant contributes two alleles per locus, so the reference
% count is (2 x number of participants) minus the alternate count.
nCaseAlleles = 2 * numel(PHECASE.SRR);
nCtrlAlleles = 2 * numel(PHECTRL.SRR);
% SAVE COUNTS AS NEW TABLE COLUMNS
LOCI.CASEREFS = nCaseAlleles - CASEN;
LOCI.CTRLREFS = nCtrlAlleles - CTRLN;
LOCI.CASEALTS = CASEN;
LOCI.CTRLALTS = CTRLN;
clearvars -except ADSP LOCI CASE CTRL PHEN PHE PHECASE PHECTRL
%###############################################################
%% COMPUTE FISHER'S P-VALUE
%###############################################################
% fishp_mex is a project helper (not shown here). It is given the per-locus
% REF/ALT counts for cases and controls and returns two outputs, stored
% below as the Fisher p-value and odds-ratio columns.
[pvals, oddsRatios] = fishp_mex(LOCI.CASEREFS, LOCI.CASEALTS, ...
                                LOCI.CTRLREFS, LOCI.CTRLALTS);
LOCI.FISHPS  = pvals;
LOCI.FISHORS = oddsRatios;
clearvars -except ADSP LOCI CASE CTRL PHEN PHE PHECASE PHECTRL
%% MAKE LATEST COUNTS THE MAIN TABLE STATS
% Copy each freshly computed "...S" column into its unsuffixed counterpart,
% in the same order as the original one-by-one assignments.
srcCols = {'CASEREFS','CTRLREFS','CASEALTS','CTRLALTS','FISHPS','FISHORS'};
dstCols = {'CASEREF', 'CTRLREF', 'CASEALT', 'CTRLALT', 'FISHP', 'FISHOR'};
for k = 1:numel(srcCols)
    LOCI.(dstCols{k}) = LOCI.(srcCols{k});
end
clear srcCols dstCols k
%% SORT VARIANT LOCI TABLE BY FISHER P-VALUE
% Order loci by ascending p-value and apply the same permutation to CASE
% and CTRL so they stay row-aligned with LOCI.
[~, byPval] = sort(LOCI.FISHP);
LOCI = LOCI(byPval,:);
CASE = CASE(byPval);
CTRL = CTRL(byPval);
% Renumber the variant IDs to reflect the new order.
nLoci = size(LOCI,1);
LOCI.VID = (1:nLoci)';
LOCI.GENE = string(LOCI.GENE);
clc; clearvars -except ADSP LOCI CASE CTRL PHEN PHE PHECASE PHECTRL
% Preview the nine most significant loci.
disp(LOCI(1:9,:))
%% STORE VARIABLES FOR PCA/TSNE AS 'AMX'
% Working copies for the embedding; LOCI/CASE/CTRL are left as they are.
AMXCTRL = CTRL;
AMXCASE = CASE;
AMX     = LOCI;
clearvars -except ADSP LOCI CASE CTRL PHEN PHE PHECASE PHECTRL...
AMX AMXCASE AMXCTRL
%% FILTER OUT VARIANTS WHERE ALT DOMINATES REF IN BOTH GROUPS
% A locus passes when, in the cases OR in the controls, the REF count is
% greater than the ALT count divided by 1.5.
caseOK = AMX.CASEREF > AMX.CASEALT./1.5;
ctrlOK = AMX.CTRLREF > AMX.CTRLALT./1.5;
PASS = caseOK | ctrlOK;
% Unsuppressed on purpose: prints the number of loci being dropped.
sum(~PASS)
AMXCTRL = AMXCTRL(PASS);
AMXCASE = AMXCASE(PASS);
AMX     = AMX(PASS,:);
% Renumber variant IDs after filtering.
AMX.VID = (1:size(AMX,1))';
clearvars -except ADSP LOCI CASE CTRL PHEN PHE PHECASE PHECTRL...
AMX AMXCASE AMXCTRL
%% TAKE THE TOP N NUMBER OF VARIANTS
% AMX is already sorted by Fisher p-value, so the first N rows are the N
% most significant loci. N is capped at the number of loci that survived
% the filter above; without the cap, AMX(1:N,:) raises an index-out-of-range
% error whenever fewer than 100 loci remain.
N = min(100, size(AMX,1));
AMX = AMX(1:N,:);
AMXCASE = AMXCASE(1:N);
AMXCTRL = AMXCTRL(1:N);
% Renumber variant IDs for the reduced table.
AMX.VID = (1:size(AMX,1))';
fprintf('\n %.0f final loci count \n\n',size(AMX,1))
clearvars -except ADSP LOCI CASE CTRL PHEN PHE PHECASE PHECTRL...
AMX AMXCASE AMXCTRL
%% MAKE RECTANGLE NN VARIANT MATRIX
% varmx is a project helper (not shown here). Only its first output is used;
% the other two are requested but discarded.
[ADNN, ~, ~] = varmx(AMX,AMXCASE,AMXCTRL,PHE);
clearvars -except ADSP LOCI CASE CTRL PHEN PHE PHECASE PHECTRL...
AMX AMXCASE AMXCTRL ADNN
%% RANDOMIZE ADNN AND REORDER PHE TO MATCH ADNN
% Row 1 of ADNN stays in place (presumably a header/label row -- confirm
% against varmx); all remaining rows are shuffled.
headerRow = ADNN(1,:);
bodyRows  = ADNN(2:end,:);
bodyRows  = bodyRows(randperm(size(bodyRows,1)),:);
ADNN = [headerRow; bodyRows];
% Column 1 of the shuffled rows is matched against PHE.SRR. PHE is reduced
% to the participants found there and sorted into the shuffled row order.
[isUsed, rowInADNN] = ismember(PHE.SRR, bodyRows(:,1));
PHE.USED  = isUsed;
PHE.ORDER = rowInADNN;
PHE = PHE(PHE.USED,:);
PHE = sortrows(PHE,'ORDER');
% Input matrix for PCA/t-SNE: drop the first row and the first three columns
% (presumably ID/metadata columns -- confirm against varmx).
PCAMX = ADNN(2:end,4:end);
clearvars -except ADSP LOCI CASE CTRL PHEN PHE PHECASE PHECTRL...
AMX AMXCASE AMXCTRL ADNN PCAMX
%% (OPTIONAL) PRE-PERFORM PCA BEFORE TSNE
% Disabled alternative: run PCA explicitly, then pass the leading components
% to tsne with its built-in PCA step switched off.
% ss = statset('pca');
% ss.Display = 'iter';
% ss.MaxIter = 100;
% ss.TolFun = 1e4;
% ss.TolX = 1e4;
% ss.UseParallel = true;
%
% [PCAC,PCAS,~,~,~] = pca( PCAMX' , 'Options',ss);
% clc; close all; scatter(PCAC(:,1),PCAC(:,2))
%
% % ...,'NumPCAComponents',0,... means don't use PCA
% tSN = tsne(PCAC(:,1:10),'NumDimensions',2,'Theta',.6,'NumPCAComponents',0);
%
% clearvars -except ADSP GENB LOCI CASE CTRL PHEN AMX AMXCASE AMXCTRL...
% PHE ADNN PCAMX tSN PCAC PCAS
%######################################################################
%% tSNE : t-Distributed Stochastic Neighbor Embedding
%######################################################################
% Embed the rows of PCAMX in two dimensions, using tsne's built-in PCA
% reduction to 8 components.
tsneArgs = {'NumDimensions',2, 'Theta',.6, 'NumPCAComponents',8};
tSN = tsne(PCAMX, tsneArgs{:});
disp('done')
% NOTE: PHECASE and PHECTRL are not in the keep-list, so they are cleared here.
clearvars -except ADSP GENB LOCI CASE CTRL PHEN AMX AMXCASE AMXCTRL...
PHE ADNN PCAMX tSN PCAC PCAS
t-SNE Plots
ALZHEIMER'S STATUS
%% PLOT TSNE --- ALZHEIMER'S STATUS (CASE/CTRL) --------------------------
% Scatter of the 2-D embedding, one color per value of PHE.AD.
close all;
fh1 = figure('Color','w');
set(fh1,'Units','normalized','Position',[.05 .05 .70 .84]);
ax1 = axes('Color','none','Position',[.05 .02 .9 .9]);
ph1 = gscatter(tSN(:,1), tSN(:,2), PHE.AD, [], '.', 15);
title({'\fontsize{16} t-SNE : CASE vs CTRL',' '})
% Legend order assumes the AD == 0 group (CTRL) is plotted before AD == 1 (CASE).
legend(ph1,{'CTRL','CASE'},'Location','NorthWest','Box','off','FontSize',12);
axis('off')
[Figures unavailable: t-SNE embedding colored by Alzheimer's status (CASE vs CTRL).]
STUDY COHORT
%% PLOT TSNE --- CONSORTIUM STUDY COHORT (1:24) -------------------------
% Scatter of the 2-D embedding, one color per value of PHE.COHORT.
close all;
fh1 = figure('Color','w');
set(fh1,'Units','normalized','Position',[.05 .05 .70 .84]);
ax1 = axes('Color','none','Position',[.05 .02 .9 .9]);
ph1 = gscatter(tSN(:,1), tSN(:,2), PHE.COHORT, [], '.', 15);
title({'\fontsize{16} t-SNE : STUDY COHORT',' '})
% legend(ph1,{'CTRL','CASE'},'FontSize',12,'Box','off','Location','NorthWest');
axis('off')
[Figures unavailable: t-SNE embedding colored by study cohort.]
SEX
%% PLOT TSNE --- SEX (M/F) ----------------------------------------------
% Scatter of the 2-D embedding, one color per value of PHE.SEX.
close all;
fh1 = figure('Color','w');
set(fh1,'Units','normalized','Position',[.05 .05 .70 .84]);
ax1 = axes('Color','none','Position',[.05 .02 .9 .9]);
ph1 = gscatter(tSN(:,1), tSN(:,2), PHE.SEX, [], '.', 15);
title({'\fontsize{16} t-SNE : SEX',' '})
% NOTE(review): the labels assume the first PHE.SEX group is male and the
% second is female -- confirm against the PHE.SEX encoding.
legend(ph1,{'Male','Female'},'Location','NorthWest','Box','off','FontSize',12);
axis('off')
[Figures unavailable: t-SNE embedding colored by sex.]
AGE
%% PLOT TSNE --- AGE (BINNED AGE) ---------------------------------------
% Scatter of the 2-D embedding for participants older than 60, colored by
% age bin. Each binned age is replaced by the left edge of its bin, so the
% groups are labeled 60, 80 and 90.
close all;
fh1=figure('Units','normalized','Position',[.05 .05 .70 .84],'Color','w');
ax1=axes('Position',[.05 .02 .9 .9],'Color','none');
AGE = round(PHE.AGE);
ofAGE = AGE>60;
A = AGE(ofAGE);
% (The stray histogram(AGE) call that used to sit here drew into these
% axes just before gscatter; it has been removed.)
[Y,E] = discretize(A,[60 80 90 91]);
% discretize returns NaN for ages outside the edges (here, above 91); those
% ages keep their raw value and form their own groups. Relabel only the
% ages that fell into a bin.
inBin = ~isnan(Y);
A(inBin) = E(Y(inBin));
ph1 = gscatter(tSN(ofAGE,1),tSN(ofAGE,2), A, [],'.',15);
title({'\fontsize{16} t-SNE : AGE',' '})
% legend(ph1,{'CTRL','CASE'},'FontSize',12,'Box','off','Location','NorthWest');
axis off
[Figures unavailable: t-SNE embedding colored by binned age.]
APOE STATUS
%% PLOT TSNE --- APOE STATUS (22,23,24,33,34,44) ------------------------
% Scatter of the 2-D embedding, one color per value of PHE.APOE, with
% per-group marker size and color overrides.
close all;
fh1=figure('Units','normalized','Position',[.05 .05 .70 .84],'Color','w');
ax1=axes('Position',[.05 .02 .9 .9],'Color','none');
ph1 = gscatter(tSN(:,1),tSN(:,2), PHE.APOE, [],'.',15);
% Overrides by group position (an empty entry keeps the gscatter default).
% The positions assume all six APOE groups are present.
apoeSizes  = {35, 25, 35, [], [], 25};
apoeColors = {[], [.20 .20 .99], [], [.99 .50 .10], [.30 .70 .80], []};
if numel(ph1) ~= numel(apoeSizes)
    % Indexing ph1(1)..ph1(6) unconditionally would error when a group is
    % missing; warn instead and style only the groups that exist.
    warning('Expected %d APOE groups but found %d; per-group styling may be misaligned.', ...
        numel(apoeSizes), numel(ph1));
end
for g = 1:min(numel(ph1), numel(apoeSizes))
    if ~isempty(apoeSizes{g})
        ph1(g).MarkerSize = apoeSizes{g};
    end
    if ~isempty(apoeColors{g})
        ph1(g).Color = apoeColors{g};
    end
end
title({'\fontsize{16} t-SNE : APOE',' '})
% legend(ph1,{'CTRL','CASE'},'FontSize',12,'Box','off','Location','NorthWest');
axis off
[Figures unavailable: t-SNE embedding colored by APOE status.]
CONSENT GROUP
%% PLOT TSNE --- CONSENT GROUP ------------------------------------------
% Scatter of the 2-D embedding, one color per value of PHE.RD.
close all;
fh1 = figure('Color','w');
set(fh1,'Units','normalized','Position',[.05 .05 .70 .84]);
ax1 = axes('Color','none','Position',[.05 .02 .9 .9]);
ph1 = gscatter(tSN(:,1), tSN(:,2), PHE.RD, [], '.', 15);
title({'\fontsize{16} t-SNE : CONSENT GROUP',' '})
% legend(ph1,{'CTRL','CASE'},'FontSize',12,'Box','off','Location','NorthWest');
axis('off')
[Figure unavailable: t-SNE embedding colored by consent group.]
Additional Genomics Analyses