Commit b46b9304 authored by Antonio Politi's avatar Antonio Politi
Browse files

Protein class FCS, workflow, and master file

parent aff1f302
function master_workflow_fit_protein
% MASTER_WORKFLOW_FIT_DYE call workflow_fit_dye for several directorie
% specified in the script
%
% Antonio Politi, EMBL, January 2017
wd = 'Z:\AntonioP_elltier1\CelllinesImaging\MitoSysPipelines';
force = 0;
%wd = 'Z:\AntonioP_elltier1\CelllinesImaging\MitoSysPipelines\160308_gfpNUP107z26z31\';
cd(wd);
if ~exist(fullfile(wd, 'ProcessingHelpFiles'));
mkdir(fullfile(wd, 'ProcessingHelpFiles'));
end
if ~exist(fullfile(wd, 'ProcessingHelpFiles', 'protein2cfiles.mat')) || force
[indirCell] = getAllFolders(wd, '\Mito(s|S)ys\d');
resfiles = [];
for i = 1:length(indirCell)
resfiles = [resfiles; getAllFiles(indirCell{i}, 'res', '2c.res', 0)];
end
save(fullfile(wd, 'ProcessingHelpFiles', 'protein2cfiles.mat'), 'resfiles');
else
load(fullfile(wd, 'ProcessingHelpFiles', 'protein2cfiles.mat'));
end
for ifile = 2:length(resfiles)
resfile = resfiles{ifile};
display(sprintf('master_workflow_fit_proteins processing res file %d/%d %s', ifile, length(resfiles), resfiles{ifile}))
workflow_fit_protein(resfile, '2c');
end
\ No newline at end of file
classdef proteinFcsmodel < absFcsmodel
% PROTEINFCSMODEL class to fit auto correlation of protein
% The workflow is a one component fit and a 2 component fit of 1 or 2
% channels
properties (Access = public)
% these are the default parameters for a protein
par = struct('N', 10, 'thetaT', 0.2, 'tauT', 100, 'f1', 1, 'tauD1', 500, 'alpha1', 1, 'tauD2', 5000, 'alpha2', 1, 'kappa', 5.5);
lb = struct('N', 0.0001, 'thetaT', 0.001,'tauT', 0, 'f1', 0, 'tauD1', 100, 'alpha1', 0.5, 'tauD2', 500, 'alpha2', 0.5, 'kappa', 1);
hb = struct('N', 100000, 'thetaT', 1, 'tauT', 1000, 'f1', 1, 'tauD1', 50000,'alpha1', 1.2, 'tauD2', 5*10^6, 'alpha2', 2, 'kappa', 20);
diffCoeff = [464, 521]; %Alexe488 at 37 degrees, Alexe568 at 37 degrees Malte Wachsmuth SPIM FCS measurement
tauBoundary = []; %boundary of where to fit the data if empty fit the range
model = 0;
end
methods
function MO = proteinFcsmodel(parnames, parvalues, lbvalues, hbvalues)
% PROTEINFCSMODEL constructor
% MO = proteinFcsmodel() - just create class with default parameter values and boundary values
% MO = proteinFcsmodel(parnames, parvalues) - create class and assign parvalues to parameters named parnames
% MO = proteinFcsmodel(parnames, parvalues, lbvalues)- create class and assign parvalues, lower boundary to parameter named in parnames.
% MO = proteinFcsmodel(parnames, parvalues, lbvalues,hbvalues) - create class and assign parvalues, lower and higher boundary to parameter named in parnames.
% parnames - cell array containing name of parameteres to be changed
% parvalues - vector with values of parameters same length than parnames
% lbvalues - vector with values of low boundary parameters same length than parnames
% hbvalues - vector with values of high boundary parameters same length than parnames
% we cannot call MO.par in the constructor before call of super class this is the reason for the repetitions of the par values
par = struct('N', 10, 'thetaT', 0.2, 'tauT', 100, 'f1', 1, 'tauD1', 500, 'alpha1', 1, 'tauD2', 5000, 'alpha2', 1, 'kappa', 5.5);
lb = struct('N', 0.0001, 'thetaT', 0.001,'tauT', 0, 'f1', 0, 'tauD1', 100, 'alpha1', 0.5, 'tauD2', 500, 'alpha2', 0.5, 'kappa', 1);
hb = struct('N', 100000, 'thetaT', 1, 'tauT', 1000, 'f1', 1, 'tauD1', 50000,'alpha1', 1.2, 'tauD2', 5*10^6, 'alpha2', 2, 'kappa', 20);
if nargin < 1
parnames = fieldnames(par);
end
if nargin < 2
parvalues = struct2array(par);
end
if nargin < 3
lbvalues = struct2array(lb);
end
if nargin < 4
hbvalues = struct2array(hb);
end
MO@absFcsmodel(parnames, parvalues, lbvalues, hbvalues)
end
function G = Gcomp(MO, par, tau)
% GCOMP returns ACF for dye like blinking
% par - struct or full vector of parameter values
% tau - time points to evaluate function
if ~isstruct(par)
par = MO.parvecToparstruct(par);
end
triplet = (1 - par.thetaT + par.thetaT*exp(-tau/par.tauT));
comp1 = ((1+(tau/par.tauD1).^par.alpha1).^-1).*((1+1/par.kappa^2*(tau/par.tauD1).^par.alpha1).^-0.5);
comp2 = ((1+(tau/par.tauD2).^par.alpha2).^-1).*((1+1/par.kappa^2*(tau/par.tauD2).^par.alpha2).^-0.5);
G = 1/par.N*triplet.*(par.f1*comp1 + (1-par.f1)*comp2);
end
function [outPar, norm, R2, norms, R2s, Nrpt, nrpara] = fitOneTwoComponents(MO, data, XC, twoC)
% FITONETWOCOMPONENTS fit model with one or 2 components
% twoC == 1: fit N, tauD1, alpha1, tauD2, alpha2, f1, (thetaT)
% twoC == 0: fit N, tauD1, alpha1, (thetaT)
% tauT is kept constant at default value
% INPUT:
% data - correlation data [tau, corr]
% XC - 0 or 1 whether data is cross-correlation. In
% this case thetaT = 0
% twoC - 0 or 1 depending whether we fit a one
% component model
% OUTPUT:
% outPar - all parameter values of the model as vector
% norm - norm at optimum
% R2 - R2 at optimum
% norms - unweighted and weighted norm
% R2s - unweighted and weighted R2
% Nrpt - Nrpoints fitted
% nrpara - number of parameters fitted
% Workflow is firts to fit N, tauD1, tauD2, f1; then N,tauD1,
% alpha1, tauD2, alpha2, f1; finally N, tauD1, alpha1, tauD2,
% alpha2, f1, thetaT
if nargin < 3
XC = 0;
end
idxboundary = MO.getDataIndexes(data, MO.tauBoundary);
data = data(idxboundary(1):end-idxboundary(2),:);
%% initial parameters
pari = MO.par;
if XC
pari.thetaT = 0;
end
pari.f1 = 0.5*(twoC==1) + 1*(twoC==0);
%% fit workflow
% find initial value for N, tauD1, (tauD2, f1) and repeat
for itry = 1:MO.maxfittry
[N, tauD1, tauD2] = MO.initialGuess(data, MO.tauBoundary); % this includes some random variations of parameters
if twoC
[parf{itry}, normT(itry)] = MO.fitmodel([N, tauD1, tauD2, pari.f1+(1-2*rand)*pari.f1/8], {'N', 'tauD1', 'tauD2', 'f1'}, pari, data);
else
[parf{itry}, normT(itry)] = MO.fitmodel([N, tauD1], {'N', 'tauD1'}, pari, data, MO.tauBoundary);
end
end
parf1 = MO.getbestfit(parf, normT);
% fit N, tauD1, alpha1, (tauD2, alpha2, f1)
for itry = 1:MO.maxfittry
if twoC
[parf{itry}, normT(itry)] = MO.fitmodel([parf1.N, parf1.tauD1, parf1.tauD2, ...
parf1.f1, pari.alpha1+(1-2*rand)*pari.alpha1/8, pari.alpha2+(1-2*rand)*pari.alpha2/8], {'N', 'tauD1', 'tauD2', 'f1', 'alpha1', 'alpha2'}, pari, data);
else
[parf{itry}, normT(itry)] = MO.fitmodel([parf1.N, parf1.tauD1, pari.alpha1+(1-2*rand)*pari.alpha1/8], {'N', 'tauD1', 'alpha1'}, pari, data, MO.tauBoundary);
end
end
parf1 = MO.getbestfit(parf, normT);
% fit N, tauD1, alpha1, (tauD2, alpha2, f1) and thetaT
if ~XC
for itry = 1:MO.maxfittry
if twoC
[parf{itry}, normT(itry)] = MO.fitmodel([parf1.N, parf1.tauD1, parf1.tauD2, ...
parf1.f1, parf1.alpha1, parf1.alpha2, pari.thetaT+(1-2*rand)*pari.thetaT/8], {'N', 'tauD1', 'tauD2', 'f1', 'alpha1', 'alpha2', 'thetaT'}, pari, data);
else
[parf{itry}, normT(itry)] = MO.fitmodel([parf1.N, parf1.tauD1, parf1.alpha1 , pari.thetaT+(1-2*rand)*pari.thetaT/8, ...
], {'N', 'tauD1', 'alpha1', 'thetaT'}, parf1, data);
end
end
end
[parf2, norm] = MO.getbestfit(parf, normT);
%% format the output
outPar(1,:) = struct2array(parf2);
R2 = MO.compR2(data, outPar);
% sum of squares unweighted and weighted
norms(1) = sum(MO.Gdiffcomp(parf2, data, 0).^2);
norms(2) = sum(MO.Gdiffcomp(parf2, data, 1).^2);
% R2 unweighted and weighted
R2s(1) = MO.compR2(data, parf2, [], 0);
R2s(2) = MO.compR2(data, parf2, [], 1);
Nrpt = length(data);
if XC
nrpara = 6*(twoC==1) + 3*(twoC==0);
else
nrpara = 7*(twoC==1) + 4*(twoC==0);
end
end
function [outPar, norm, R2, norms, R2s, Nrpt, nrpara] = fitTwoComponents(MO, data, XC)
% FITTWOCOMPONENTS fit N, tauD1, alpha1, tauD2, alpha2, f1, thetaT
% INPUT:
% data - correlation data [tau, corr]
% XC - 0 or 1 whether data is cross-correlation. In
% this case thetaT = 0
% OUTPUT:
% outPar - all parameter values of the model as vector
% norm - norm at optimum
% R2 - R2 at optimum
% norms - unweighted and weighted norm
% R2s - unweighted and weighted R2
% Nrpt - Nrpoints fitted
% nrpara - number of parameters fitted
%
% Workflow is firts to fit N, tauD1, tauD2, f1; then N,tauD1,
% alpha1, tauD2, alpha2, f1; finally N, tauD1, alpha1, tauD2,
% alpha2, f1, thetaT
if nargin < 3
XC = 0;
end
[outPar, norm, R2, norms, R2s, Nrpt, nrpara] = fitOneTwoComponents(MO, data, XC, 1);
end
function [outPar, norm, R2, norms, R2s, Nrpt, nrpara] = fitOneComponent(MO, data, XC)
% FITONECOMPONENT fit N, tauD1, alpha1, thetaT. tauT is kept
% constant at default value
% INPUT:
% data - correlation data [tau, corr]
% XC - 0 or 1 whether data is cross-correlation. In
% this case thetaT = 0
% OUTPUT:
% outPar - all parameter values of the model as vector
% norm - norm at optimum
% R2 - R2 at optimum
% norms - unweighted and weighted norm
% R2s - unweighted and weighted R2
% Nrpt - Nrpoints fitted
% nrpara - number of parameters fitted
%
% Workflow is firts to fit N, tauD1; then N,tauD1, alpha1;
% then N, tauD1, alpha1, thetaT. If cross
if nargin < 3
XC = 0;
end
[outPar, norm, R2, norms, R2s, Nrpt, nrpara] = fitOneTwoComponents(MO, data, XC, 0);
end
end
end
function workflow_fit_protein(resfile, FAsession)
% WORKFLOW_FIT_PROTEIN perform fitting of protein data using as input a res file from FA
% INPUT:
% resfile - a result file containing the directory where to extract files
% FAsession - session name
% OUTPUT:
%
% Antonio Politi, EMBL, January 2017
%% default values
if nargin < 1
% prompt for a directory
[resfile, pathname] = uigetfile({'*.res'; '*.*'},'Select res file', 'C:\Users\toni\Dropbox\NPCMaturation\matlabcode\FCS\example_data\Protein');
%[resfile, pathname] = uigetfile({'*.res'; '*.*'},'Select res file', 'Z:\AntonioP_elltier1\CelllinesImaging\MitoSysPipelines\160308_gfpNUP107z26z31\MitoSys2');
if ~resfile
return
end
resfile = fullfile(pathname, resfile);
end
if nargin < 2
FAsession = '2c';
end
%% Initialization
% load model class
clear('MO');
MO = proteinFcsmodel();
% read FA result table
restable2c = readtable(resfile, 'Delimiter', '\t', 'Header', 1, 'FileType', 'text', 'TreatAsEmpty',{'NA'});
restable1c = restable2c;
% stat entries
restable1c.P11Name = repmat({'pValueLRT'}, length(restable1c.P11Name),1);
restable1c.P12Name = repmat({'pValueFtest'}, length(restable1c.P11Name),1);
restable1c.P13Name = repmat({'PAIC'}, length(restable1c.P11Name),1);
restable1c.P14Name = repmat({'BIC'}, length(restable1c.P11Name),1);
restable2c.P11Name = repmat({'pValueLRT'}, length(restable1c.P11Name),1);
restable2c.P12Name = repmat({'pValueFtest'}, length(restable1c.P11Name),1);
restable2c.P13Name = repmat({'PAIC'}, length(restable1c.P11Name),1);
restable2c.P14Name = repmat({'BIC'}, length(restable1c.P11Name),1);
%% this write empty entries as NaN (must double check afterwards in particular for annotation)
%%
% get names of xml files
xmlFAfiles = strrep(restable2c.FullPath, ['.zen'], ['.zen.' FAsession '.xml']);
%% read focal volume data
focVol = getAllFiles(fileparts(resfile), 'txt', 'focalVolume');
if length(focVol) == 0
display(['No focalVolume file associated with ' resfile]);
return;
end
if length(focVol) > 1
display(['several focalVolume files associated with ' resfile]);
return;
end
focVol = readtable(focVol{1}, 'Delimiter', '\t');
%%
if isempty(xmlFAfiles)
display('No FA xml found use .fcs file instead');
cond = 3;
else
cond = 1;
end
switch cond
case 1 %fit using data from FA weighted with standard deviation
MO.weightChi2 = 1;
ZEN = 0;
case 2 %fit using data from FA unweighted
MO.weightChi2 = 0;
ZEN = 0;
case 3 %fit using data from .fcs ZEN files
MO.weightChi2 = 0; % this is unweighted by definition as std is not available for .fcs
ZEN = 1;
end
%% define file prefix for images
if ZEN
prefix = 'parZEN';
else
prefix = 'parFAcorr';
end
if ~MO.weightChi2
prefix = [prefix '_unw'];
end
%% load data into cell array dataC
display(['Load correlation data for ' resfile])
if ZEN
dataC = MO.readZeissfcs(zenfiles{1});
else
dataC = MO.readFAxml(xmlFAfiles);
end
%% Workflow
MO.maxfittry = 3; %3 repetitions for each fit
MO.weightChi2 = 0;
weightStr = {'','_w'};
% in FA seems more that unweighted fit is used. The residuals look most
% close to each other when unweighted is used. Also the first point is
% often out of fit. Unfortunately the exact measure on how the fit is
% performed is not available. If we take best fit here and plugin into FA
% the Chi2 slightly increases
for iD = 1:length(dataC)
display(sprintf('Fitting of data set %d/%d', iD, length(dataC)));
% Fit for all channels (if CC or XC is found)
outpar1c = repmat(struct2array(MO.par),3,1); % parameters for 1 component fit
outpar2c = repmat(struct2array(MO.par),3,1); % parameters for 2 component fit
for iCh = 1:3 %run for all channels
if iCh == 3
Chstr = 'Cross';
else
Chstr = ['Ch' num2str(iCh)];
end
if dataC{iD}(1,iCh*2) == 0 % exclude if no data available
continue
end
MO.par.kappa = focVol.kappa(iCh);
% norm1c is sum of squared residuals used in the fit. norms1c = [unweighted, weighted norm]
% R2_1c is from norm used for the fit. R2s_1c:[R2_unweighted,
% R2_weighted norm]
[outpar1c(iCh,:), norm1c, R2_1c, norms1c, R2s_1c, Nrpts1c, nrpara1c] = MO.fitOneComponent(dataC{iD}(:, [1 2*iCh 2*iCh+1]), iCh==3);
[outpar2c(iCh,:), norm2c, R2_2c, norms2c, R2s_2c, Nrpts2c, nrpara2c] = MO.fitTwoComponents(dataC{iD}(:,[1 2*iCh 2*iCh+1]), iCh==3);
% adjusted R2s as used in FA
R2adj_1c = 1 - (1-R2s_1c).*(Nrpts1c-1)./(Nrpts1c - nrpara1c - 1);
R2adj_2c = 1 - (1-R2s_2c).*(Nrpts2c-1)./(Nrpts2c - nrpara2c - 1);
%% compute model comparison statistics
% use for AIC the weighted norm (2nd endtry
[PAIC, BIC, pvalueChi, pvalueFtest] = modelCompare([norms1c(2), norms2c(2)], [Nrpts1c, Nrpts2c], [nrpara1c, nrpara2c]);
for ipar = 1:length(outpar2c)
% N, thetaT, tauT, f1, tauD1, alpha1, tauD2, alpha2, kappa
restable1c(iD, ['P0' num2str(ipar) 'Value' Chstr]) = table(outpar1c(iCh, ipar));
restable2c(iD, ['P0' num2str(ipar) 'Value' Chstr]) = table(outpar2c(iCh, ipar));
end
%% only report weighted norm
restable1c(iD, ['ChiSq' Chstr]) = table(norms1c(2)/Nrpts1c);
restable2c(iD, ['ChiSq' Chstr]) = table(norms2c(2)/Nrpts2c);
%report unweighted R2 as the weight are largely overestimated. This
restable1c(iD, ['RSqAdj' Chstr]) = table(R2adj_1c(1));
restable2c(iD, ['RSqAdj' Chstr]) = table(R2adj_2c(1));
restable1c(iD, ['P11Value' Chstr]) = table(pvalueChi);
restable1c(iD, ['P12Value' Chstr]) = table(pvalueFtest);
restable1c(iD, ['P13Value' Chstr]) = table(PAIC(1));
restable1c(iD, ['P14Value' Chstr]) = table(BIC(1));
restable2c(iD, ['P11Value' Chstr]) = table(pvalueChi);
restable2c(iD, ['P12Value' Chstr]) = table(pvalueFtest);
restable2c(iD, ['P13Value' Chstr]) = table(PAIC(2));
restable2c(iD, ['P14Value' Chstr]) = table(BIC(2));
end
% write updated xml file
for ifit = 1:2
node = xmlFA.readnode(xmlFAfiles{iD});
[path1, fout, ext] = fileparts(xmlFAfiles{iD});
[path, fout, ext] = fileparts(fout);
fname_out = fullfile(path1, sprintf('%s.%dc_opt%s.xml', fout, ifit, weightStr{MO.weightChi2+1}));
idxs = MO.getDataIndexes(dataC{iD}, MO.tauBoundary);
Ntfits = length(idxs(1):length(dataC{iD})-idxs(2));
fit = zeros(Ntfits, 3);
if ifit == 1
par = outpar1c;
else
par = outpar2c;
end
for iC = 1:3
if (dataC{iD}(1,iC*2) == 0) % do not process if channel has not been acquired
continue
end
fit(:,iC) = MO.Gcomp(MO.parvecToparstruct(par(iC,:)), dataC{iD}(idxs(1):end-idxs(2),1));
end
xmlFA.writepar(node, par, struct2array(MO.lb), struct2array(MO.hb));
xmlFA.writefit(node, fit, idxs, MO.model);
xmlFA.writenode(fname_out, node);
end
end
%%
tmpfiles = {fullfile(fileparts(resfile), ['1c_tmp.res']), ...
fullfile(fileparts(resfile), ['2c_tmp.res'])};
outfiles = {fullfile(fileparts(resfile), ['1c_proc_weight' num2str(MO.weightChi2) '.res']), ...
fullfile(fileparts(resfile), ['2c_proc_weight' num2str(MO.weightChi2) '.res'])};
tables = {restable1c, restable2c};
for ifile = 1:2
writetable(tables{ifile}, tmpfiles{ifile}, 'WriteVariableNames',false, 'Delimiter', '\t','FileType', 'text');
% prepend first 2 lines to keep compability
filepre = fopen(resfile);
fileout = fopen(outfiles{ifile}, 'w');
for i = 1:2
tline = fgetl(filepre);
fwrite( fileout, sprintf('%s\n', tline ) );
end
fclose(filepre);
fclose(fileout);
appendFiles(tmpfiles{ifile}, outfiles{ifile});
delete(tmpfiles{ifile});
end
end
function [PAIC, BIC, pvalueChi, pvalueF] = modelCompare(norms, nrpts, nrpara)
% MODELCOMPARE compute stats for comparing 1 and 2 component model
% corrected AIC (AIC will select tendentially larger models than BIC,
% so BIC is more strict)
AIC = 2*nrpara + norms + 2*(nrpara+1).*(nrpara)./(nrpts-nrpara-2);
PAIC = exp((min(AIC) - AIC)/length(AIC)); % probability of worse model to be correct
% BIC. The lowest value is better
BIC = log(nrpts).*nrpara + norms;
% log-likelihood ratio test
Dlog = norms(1) - norms(2);
df = nrpara(2) - nrpara(1);
pvalueChi = 1 - chi2cdf(Dlog, df);
%%
% F-test. Keep in mind that the sum(weighted_squares) is not ~ Nrpts.
% so we are overestimating the error.
% we can reassign the norms (but this is rather unorthodox)
fact = nrpts(1)/norms(1);
F = (Dlog/df)/(fact*norms(2)/(nrpts(1)-nrpara(2)));
pvalueF = 1 - fcdf(F, df, nrpts(1)-nrpara(2));
end
function status = appendFiles( readFile, writtenFile )
% APPENDFILES append readFile to writtenFile
fr = fopen( readFile, 'rt' );
fw = fopen( writtenFile, 'at' );
while feof( fr ) == 0
tline = fgetl( fr );
fwrite( fw, sprintf('%s\n',tline ) );
end
fclose(fr);
fclose(fw);
end
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment