######################################################################## # This file contains the R function diverstest, which implements # the first three tests described in Gilbert, Rossini, and # Shankarappa (2005), "Two-sample tests for comparing intra-individual # genetic sequence diversity between populations", Biometrics, 2005, # 61:107-118. # # The test statistics are Tpoolmean, Tpoolmedian, and Tsubjmean, # which are defined in formulas (6), (7), and (8) of # Gilbert, Rossini, and Shankarappa (2005), respectively. # # For each test statistic, the program prints out (to the screen and to # the file diverstest.out) the output of the the testing procedures # (including test statistic, p-value, estimates of mean or median # values, and 95% confidence intervals. ######################################################################## diverstest <- function(mat) { # mat is a matrix with 5 columns: # col 1 = strand1 identifies the sequence number for a given subject # col 2 = strand2 identifies the sequence number for the same given subject # col 3 = group membership: 1 = group 1; 2 = group 2 # col 4 = id: identifies the subject # col 5 = pairwise distance between strand1 and strand 2 for subject id strand1 <- mat[,1] strand2 <- mat[,2] group <- mat[,3] id <- mat[,4] dists <- mat[,5] M1 <- length(unique(id[group==1])) M2 <- length(unique(id[group==2])) # M1 (M2) number of subjects in group 1 (2) # Compute the mean diversity for each group: n <- nrow(mat) n1seqs <- rep(NA,M1) # vector of number of sequences per individual in group 1 n2seqs <- rep(NA,M2) # vector of number of sequences per individual in group 2 for (i in 1:M1) { n1seqs[i] <- max(strand2[group==1 & id==unique(id[group==1])[i]]) } for (i in 1:M2) { n2seqs[i] <- max(strand2[group==2 & id==unique(id[group==2])[i]]) } # n1seqs (n2seqs) a vector of numbers of sequences per person in group 1 (2) n1 <- sum(n1seqs*(n1seqs-1))/2 n2 <- sum(n2seqs*(n2seqs-1))/2 #n1pairs <- n1seqs*(n1seqs-1)/2 #n2pairs <- n2seqs*(n2seqs-1)/2 n1cov <- (1/6)*sum((n1seqs*(n1seqs-1)*(n1seqs-2))) n2cov <- (1/6)*sum((n2seqs*(n2seqs-1)*(n2seqs-2))) mu1hat <- mean(dists[group==1]) mu2hat <- mean(dists[group==2]) PM <- median(dists) # PM equals the median pairwise distances aggregated over both groups mu1hatvect <- rep(NA,M1) mu2hatvect <- rep(NA,M2) for (i in 1:M1) { mu1hatvect[i] <- mean(dists[group==1 & id==unique(id[group==1])[i]]) } for (i in 1:M2) { mu2hatvect[i] <- mean(dists[group==2 & id==unique(id[group==2])[i]]) } sigma12hat <- var(dists[group==1]) sigma22hat <- var(dists[group==2]) sigma12hatvect <- rep(NA,M1) sigma22hatvect <- rep(NA,M2) for (i in 1:M1) { sigma12hatvect[i] <- ifelse(length(dists[group==1 & id==unique(id[group==1])[i]])>1, var(dists[group==1 & id==unique(id[group==1])[i]]),0) } for (i in 1:M2) { sigma22hatvect[i] <- ifelse(length(dists[group==2 & id==unique(id[group==2])[i]])>1, var(dists[group==2 & id==unique(id[group==2])[i]]),0) } sigma11hat <- 0 sigma11hatvect <- rep(0,M1) for (iii in 1:M1) { for (k in 1:(n1seqs[iii]-2)) { for (kk in (k+1):(n1seqs[iii]-1)) { sigma11hatvect[iii] <- sigma11hatvect[iii] + sum(c((dists[group==1 & id==unique(id[group==1])[iii] & strand1==k & strand2==kk]-mu1hat)* (dists[group==1 & id==unique(id[group==1])[iii] & strand1==k & strand2 > kk]-mu1hat), (dists[group==1 & id==unique(id[group==1])[iii] & strand1==k & strand2==kk]-mu1hat)* (dists[group==1 & id==unique(id[group==1])[iii] & strand1==kk & strand2 > kk]-mu1hat))) }} sigma11hat <- sigma11hat + sigma11hatvect[iii] } sigma11hat <- max(sigma11hat/(2*n1cov-1),0) for (iii in 1:M1) { sigma11hatvect[iii] <- max(sigma11hatvect[iii]/((1/3)*(n1seqs[iii]*(n1seqs[iii]-1)*(n1seqs[iii]-2)-1)),0) } sigma21hat <- 0 sigma21hatmed <- 0 sigma21hatvect <- rep(0,M2) ind <- 0 for (iii in 1:M2) { for (k in 1:(n2seqs[iii]-2)) { for (kk in (k+1):(n2seqs[iii]-1)) { sigma21hatvect[iii] <- sigma21hatvect[iii] + sum(c((dists[group==2 & id==unique(id[group==2])[iii] & strand1==k & strand2==kk]-mu2hat)* (dists[group==2 & id==unique(id[group==2])[iii] & strand1==k & strand2 > kk]-mu2hat), (dists[group==2 & id==unique(id[group==2])[iii] & strand1==k & strand2==kk]-mu2hat)* (dists[group==2 & id==unique(id[group==2])[iii] & strand1==kk & strand2 > kk]-mu2hat))) sigma21hatmed <- sigma21hatmed + sum(c(ifelse(dists[group==2 & id==unique(id[group==2])[iii] & strand1==k & strand2==kk] <= PM & dists[group==2 & id==unique(id[group==2])[iii] & strand1==k & strand2 > kk] <= PM,1,0), ifelse(dists[group==2 & id==unique(id[group==2])[iii] & strand1==k & strand2==kk] <= PM & dists[group==2 & id==unique(id[group==2])[iii] & strand1==kk & strand2 > kk] <= PM,1,0))) }} sigma21hat <- sigma21hat + sigma21hatvect[iii] } sigma21hat <- max(sigma21hat/(2*n2cov - 1),0) sigma21hatmed <- max(sigma21hatmed/(2*n2cov - 1),0) for (iii in 1:M2) { sigma21hatvect[iii] <- max(sigma21hatvect[iii]/((1/3)*(n2seqs[iii]*(n2seqs[iii]-1)*(n2seqs[iii]-2)-1)),0) } ######################################### # Statistic based on the pooled means: # # Tpoolmean is given in formula (6) [T_poolmn] of # Gilbert, Rossini, and Shankarappa (2004) piece1 <- (1/n1)*(2*sum(n1seqs) -4*M1)*sigma11hat piece2 <- (1/n1)*sigma12hat var1hat <- piece1 + piece2 piece1 <- (1/n2)*(2*sum(n2seqs) -4*M2)*sigma21hat piece2 <- (1/n2)*sigma22hat var2hat <- piece1 + piece2 Tpoolmean <- (mu1hat - mu2hat)/(sqrt(var1hat + var2hat)) pvalueTpoolmean <- 2*(1-pnorm(abs(Tpoolmean))) ################################################ # Statistic based on the pooled group 2 median: # # Tpoolmedian is given in formula (7) [T_poolmed] of # Gilbert, Rossini, and Shankarappa (2004) phat <- length(dists[group==2 & dists <= PM])/n2 # phat is \hat P_med2 in Gilbert, Rossibi, and Shankarappa (2004) sigma22hatmed <- (n2/(n2-1))*phat*(1-phat) sigma21hatmed <- max((n2/(n2-1))*(sigma21hatmed - (phat**2)),0) piece12 <- (1/n2)*(2*sum(n2seqs) -4*M2)*sigma21hatmed piece22 <- (1/n2)*sigma22hatmed var2hatmed <- piece12 + piece22 Tpoolmedian <- (phat - 0.5)/(sqrt(var2hatmed)) pvalueTpoolmedian <- 2*(1-pnorm(abs(Tpoolmedian))) ############################################################################## # Statistic for subject-specific average intra-individual pairwise distances: # # Tsubjmean is given in formula (8) [T_subjmn] of # Gilbert, Rossini, and Shankarappa (2004) piece1 <- sum(((1/(n1seqs*(n1seqs-1)/2))*(2*(n1seqs-2)*sigma11hatvect + sigma12hatvect))) piece2 <- sum(((1/(n2seqs*(n2seqs-1)/2))*(2*(n2seqs-2)*sigma21hatvect + sigma22hatvect))) Tsubjmean <- (mean(mu1hatvect) - mean(mu2hatvect))/sqrt(((piece1/(M1^2)) + (piece2/(M2^2)))) pvalueTsubjmean <- 2*(1-pnorm(abs(Tsubjmean))) # Write out the results to the screen: cat("\n") cat(paste("T-test type test [pooled mean diversity test]"),"\n") cat(paste("Test statistic Tpoolmn for comparing pooled mean diversity = ",round(Tpoolmean,5)),"\n") cat(paste("2-sided p-value for pooled mean diversity test = ",round(pvalueTpoolmean,5)),"\n") cat(paste("Average diversity Group 1 = ",round(mu1hat,5)),"\n") cat(paste("Estimated variance of mean diversity Group 1 = ",round(var1hat,8)),"\n") cat(paste("Estimated correlation of distances Group 1 = ",round(sqrt(sigma11hat/sigma12hat),8)),"\n") cat(paste("Average diversity Group 2 = ",round(mu2hat,5)),"\n") cat(paste("Estimated variance of mean diversity Group 2 = ",round(var2hat,8)),"\n") cat(paste("Estimated correlation of distances Group 2 = ",round(sqrt(sigma21hat/sigma22hat),8)),"\n") cat(paste("Average difference in diversity (Group 1 - 2) = ",round(mu1hat-mu2hat,5)),"\n") cat(paste("95% CI for mean difference = ",round(mu1hat-mu2hat - 1.96*sqrt(var1hat+var2hat),5),",", round(mu1hat-mu2hat + 1.96*sqrt(var1hat+var2hat),5)),"\n") cat("\n") cat(paste("Rank-based test [pooled median diversity test]"),"\n") cat(paste("Test statistic Tpoolmed for comparing pooled median diversity = ",round(Tpoolmedian,5)),"\n") cat(paste("2-sided p-value for pooled median diversity test = ",round(pvalueTpoolmedian,5)),"\n") cat(paste("Median diversity Group 1 = ",round(median(dists[group==1]),5)),"\n") cat(paste("Median diversity Group 2 = ",round(median(dists[group==2]),5)),"\n") cat(paste("Pooled median for both groups combined = ",round(PM,5)),"\n") cat(paste("Probability a Group 2 pairwise distance <= Pooled median for both groups = ",round(phat,5)),"\n") cat(paste("Variance in denominator of Tpoolmedian = ",round(sigma22hatmed,8)),"\n") cat(paste("Estimated correlation Group 2 = ",round(sqrt(sigma21hatmed/sigma22hatmed),8)),"\n") cat("\n") cat(paste("Test statistic Tsubjmn for t-test of subject-specific averages = ",round(Tsubjmean,5)),"\n") cat(paste("2-sided p-value for t-test of subject-specific averages = ",round(pvalueTsubjmean,5)),"\n") cat(paste("Average diversity Group 1, subject-specific averages = ",round(mean(mu1hatvect),5)),"\n") cat(paste("Estimated variance of mean average diversities Group 1 = ",round((piece1/(M1^2)),8)),"\n") cat(paste("Average diversity Group 2, subject-specific averages = ",round(mean(mu2hatvect),5)),"\n") cat(paste("Estimated variance of mean average diversities Group 2 = ",round((piece2/(M2^2)),8)),"\n") cat(paste("Difference in mean diversities (Group 1 - 2) = ",round(mean(mu1hatvect)-mean(mu2hatvect),5)),"\n") cat(paste("95% CI for difference = ",round(mean(mu1hatvect)-mean(mu2hatvect)-1.96*sqrt((piece1/(M1^2))+(piece2/(M2^2))),5),",",round(mean(mu1hatvect)-mean(mu2hatvect)+1.96*sqrt((piece1/(M1^2))+(piece2/(M2^2))),5)),"\n") # Write the same information to the file diverstest.out: write(paste("T-test type test [pooled mean diversity test]"),file="diverstest.out",append=F) write(paste("Test statistic Tpoolmn for comparing pooled mean diversity = ",round(Tpoolmean,5)),file="diverstest.out",append=T) write(paste("2-sided p-value for pooled mean diversity test = ",round(pvalueTpoolmean,5)),file="diverstest.out",append=T) write(paste("Average diversity Group 1 = ",round(mu1hat,5)),file="diverstest.out",append=T) write(paste("Estimated cariance of mean diversity Group 1 = ",round(var1hat,5)),file="diverstest.out",append=T) write(paste("Estimated correlation of distances Group 1 = ",round(sqrt(sigma11hat/sigma12hat),5)),file="diverstest.out",append=T) write(paste("Average diversity Group 2 = ",round(mu2hat,5)),file="diverstest.out",append=T) write(paste("Estimated variance of mean diversity Group 2 = ",round(var2hat,5)),file="diverstest.out",append=T) write(paste("Estimated correlation of distances Group 2 = ",round(sqrt(sigma21hat/sigma22hat),5)),file="diverstest.out",append=T) write(paste("Average difference in diversity (Group 1 - 2) = ",round(mu1hat-mu2hat,5)),file="diverstest.out",append=T) write(paste("95% CI for mean difference = ",round(mu1hat-mu2hat - 1.96*sqrt(var1hat+var2hat),5),",", round(mu1hat-mu2hat + 1.96*sqrt(var1hat+var2hat),5)),file="diverstest.out",append=T) write(paste(""),file="diverstest.out",append=T) write(paste("Rank-based test [pooled median diversity test]"),file="diverstest.out",append=T) write(paste("Test statistic Tpoolmed for comparing pooled median diversity = ",round(Tpoolmedian,5)),file="diverstest.out",append=T) write(paste("2-sided p-value for pooled median diversity test = ",round(pvalueTpoolmedian,5)),file="diverstest.out",append=T) write(paste("Median diversity Group 1 = ",round(median(dists[group==1]),5)),file="diverstest.out",append=T) write(paste("Median diversity Group 2 = ",round(median(dists[group==2]),5)),file="diverstest.out",append=T) write(paste("Pooled median for both groups combined = ",round(PM,5)),file="diverstest.out",append=T) write(paste("Probability a Group 2 pairwise distance <= Pooled median for both groups = ",round(phat,5)),file="diverstest.out",append=T) write(paste("Variance in denominator of Tpoolmedian = ",round(sigma22hatmed,5)),file="diverstest.out",append=T) write(paste("Estimated correlation Group 2 = ",round(sqrt(sigma21hatmed/sigma22hatmed),5)),file="diverstest.out",append=T) write(paste(""),file="diverstest.out",append=T) write(paste("Test statistic Tsubjmn for t-test of subject-specific averages = ",round(Tsubjmean,5)),file="diverstest.out",append=T) write(paste("2-sided p-value for t-test of subject-specific averages = ",round(pvalueTsubjmean,5)),file="diverstest.out",append=T) write(paste("Average diversity Group 1, subject-specific averages = ",round(mean(mu1hatvect),5)),file="diverstest.out",append=T) write(paste("Estimated variance of mean average diversities Group 1 = ",round((piece1/(M1^2)),5)),file="diverstest.out",append=T) write(paste("Average diversity Group 2, subject-specific averages = ",round(mean(mu2hatvect),5)),file="diverstest.out",append=T) write(paste("Estimated variance of mean average diversities Group 2 = ",round((piece2/(M2^2)),5)),file="diverstest.out",append=T) write(paste("Difference in mean diversities (Group 1 - 2) = ",round(mean(mu1hatvect)-mean(mu2hatvect),5)),file="diverstest.out",append=T) write(paste("95% CI for difference = ",round(mean(mu1hatvect)-mean(mu2hatvect)-1.96*sqrt((piece1/(M1^2))+(piece2/(M2^2))),5),",",round(mean(mu1hatvect)-mean(mu2hatvect)+1.96*sqrt((piece1/(M1^2))+(piece2/(M2^2))),5)),file="diverstest.out",append=T) return(c(pvalueTpoolmean,pvalueTpoolmedian,pvalueTsubjmean,round(Tpoolmean,5),round(Tpoolmedian,5),round(Tsubjmean,5))) }