Title: | Graph-Based k-Sample Comparisons and Relevance Analysis in High Dimensions |
---|---|
Description: | We propose two distribution-free test statistics based on between-sample edge counts and measure the degree of relevance by standardized counts. Users can set edge costs in the graph to compare the parameters of the distributions. Methods for comparing distributions are as described in: Xiaoping Shi (2021) <arXiv:2107.00728>. |
Authors: | Xiaoping Shi [aut, cre] |
Maintainer: | Xiaoping Shi <[email protected]> |
License: | MIT + file LICENSE |
Version: | 1.0 |
Built: | 2024-11-16 03:33:09 UTC |
Source: | https://github.com/cran/GRelevance |
Given the groups and the shortest Hamiltonian path, this function returns the number of edges that connect nodes between samples.
compbypath(G,re.path)
compbypath(G,re.path)
G |
a list of all groups |
re.path |
the shortest Hamiltonian path returned from the function Hpath |
the number of edges that connect nodes between samples
Hpath
## Example: count between-sample edges along the shortest Hamiltonian path
## for three multivariate normal samples in d = 100 dimensions.
d <- 100
n1 <- 20; n2 <- 30; n3 <- 40
N <- n1 + n2 + n3
mu1 <- rep(0, d)
mu2 <- mu1            # same mean as sample 1
mu3 <- mu2 + 0.1      # mean shift for sample 3
cov1 <- 0.2^(abs(outer(1:d, 1:d, "-")))  # AR(1)-type covariance
cov2 <- 0.2^(abs(outer(1:d, 1:d, "-")))
cov3 <- 0.4^(abs(outer(1:d, 1:d, "-")))  # stronger dependence for sample 3
sam1 <- MASS::mvrnorm(n = n1, mu = mu1, Sigma = cov1)
sam2 <- MASS::mvrnorm(n = n2, mu = mu2, Sigma = cov2)
sam3 <- MASS::mvrnorm(n = n3, mu = mu3, Sigma = cov3)
Data <- rbind(sam1, sam2, sam3)
Dist <- philentropy::distance(Data, method = "euclidean")
Dist[lower.tri(Dist)] <- NA  # keep the upper triangle only
diag(Dist) <- NA             # FIX: `Dist[diag(Dist)] <- NA` was a no-op (indexing by 0)
G <- list()
G[[1]] <- 1:n1
G[[2]] <- (n1 + 1):(n1 + n2)
G[[3]] <- (n1 + n2 + 1):(n1 + n2 + n3)
compbypath(G, Hpath(1, N, Dist))
Applies the path.kruskal function based on the nodes and edge.cost (sorts the weights from minimum to maximum). Given the starting node, ending node, and the distance matrix, this function returns the list of nodes of each edge from the shortest Hamiltonian path. The Hamiltonian path itself is computed by path.kruskal.
Hpath(n1,n2,mat)
Hpath(n1,n2,mat)
n1 |
starting node |
n2 |
ending node |
mat |
distance matrix (distance type is determined by the reader) |
list of nodes of each edge from the shortest Hamiltonian path
path.kruskal
## Example: shortest Hamiltonian path over two multivariate normal samples.
G <- list()
set.seed(1)                  # reproducible simulated data
n1 <- 20; n2 <- 40
N <- n1 + n2
G[[1]] <- 1:n1
G[[2]] <- (n1 + 1):(n1 + n2)
d <- 10
mu1 <- rep(0, d)
mu2 <- mu1 + 0.1             # small mean shift for sample 2
true.cov1 <- 0.4^(abs(outer(1:d, 1:d, "-")))  # AR(1)-type covariance
true.cov2 <- 0.4^(abs(outer(1:d, 1:d, "-")))
sam1 <- MASS::mvrnorm(n = n1, mu = mu1, Sigma = true.cov1)
sam2 <- MASS::mvrnorm(n = n2, mu = mu2, Sigma = true.cov2)
Data <- rbind(sam1, sam2)
Dist <- philentropy::distance(Data, method = "euclidean")
Dist[lower.tri(Dist)] <- NA  # keep the upper triangle only
diag(Dist) <- NA             # FIX: `Dist[diag(Dist)] <- NA` was a no-op (indexing by 0)
Hpath(1, N, Dist)
Given the groups and the observed statistic, this function returns the p-value.
Mpermut(G,W,obs)
Mpermut(G,W,obs)
G |
a list of all groups |
W |
the weight matrix |
obs |
the observed statistic |
the p-value
## Example: permutation p-value for the minimum standardized between-sample
## edge count (statistic S_min) over two multivariate normal samples.
G <- list()
set.seed(1)
n1 <- 20; n2 <- 40
N <- n1 + n2
G[[1]] <- 1:n1
G[[2]] <- (n1 + 1):(n1 + n2)
d <- 10
mu1 <- rep(0, d)
mu2 <- mu1 + 0.1             # small mean shift for sample 2
true.cov1 <- 0.4^(abs(outer(1:d, 1:d, "-")))
true.cov2 <- 0.4^(abs(outer(1:d, 1:d, "-")))
sam1 <- MASS::mvrnorm(n = n1, mu = mu1, Sigma = true.cov1)
sam2 <- MASS::mvrnorm(n = n2, mu = mu2, Sigma = true.cov2)
Data <- rbind(sam1, sam2)
Dist <- philentropy::distance(Data, method = "euclidean")
Dist[lower.tri(Dist)] <- NA  # keep the upper triangle only
diag(Dist) <- NA             # FIX: `Dist[diag(Dist)] <- NA` was a no-op (indexing by 0)
counts <- compbypath(G, Hpath(1, N, Dist))
W <- Weight(G)
# Set W[i, j] = 0 if we do not consider the relevance between sample i and sample j.
C <- counts$EC                     # observed between-sample edge counts
Z <- (C - W$mean) * W$weight       # standardized counts
obs <- min(Z[!is.na(Z)])           # observed test statistic
Mpermut(G, W$weight, obs)
Calculates the shortest Hamiltonian path based on the sorted edge weights and the nodes
path.kruskal(nodes,edge_cost)
path.kruskal(nodes,edge_cost)
nodes |
sequence of nodes 1,...,n from the graph which is based on the high-dimensional data that is provided by the reader |
edge_cost |
sorted edge weights |
the shortest Hamiltonian path
Hpath
## Example: build the sorted edge-cost table by hand and run path.kruskal
## to obtain the shortest Hamiltonian path.
G <- list()
set.seed(1)
n1 <- 20; n2 <- 40
N <- n1 + n2
G[[1]] <- 1:n1
G[[2]] <- (n1 + 1):(n1 + n2)
d <- 10
mu1 <- rep(0, d)
mu2 <- mu1 + 0.1
true.cov1 <- 0.4^(abs(outer(1:d, 1:d, "-")))
true.cov2 <- 0.4^(abs(outer(1:d, 1:d, "-")))
sam1 <- MASS::mvrnorm(n = n1, mu = mu1, Sigma = true.cov1)
sam2 <- MASS::mvrnorm(n = n2, mu = mu2, Sigma = true.cov2)
Data <- rbind(sam1, sam2)
Dist <- philentropy::distance(Data, method = "euclidean")
Dist[lower.tri(Dist)] <- NA  # keep the upper triangle only
diag(Dist) <- NA             # FIX: `Dist[diag(Dist)] <- NA` was a no-op (indexing by 0)
mat <- Dist
n1 <- 1; n2 <- N             # note: reuses n1/n2 as start/end node indices
n0 <- n2 - n1 + 1
# edge.cost rows: (node i, node j, cost of edge i-j), one row per upper-triangle pair.
edge.cost <- matrix(NA, nrow = n0 * (n0 - 1) / 2, ncol = 3)
temp <- 1
for (i in n1:(n2 - 1)) {
  for (j in (i + 1):n2) {
    edge.cost[temp, 3] <- mat[i, j]
    edge.cost[temp, 1] <- i - n1 + 1
    edge.cost[temp, 2] <- j - n1 + 1
    temp <- temp + 1
  }
}
edge.cost <- edge.cost[sort.list(edge.cost[, 3]), ]  # sort edges by ascending cost
path.kruskal(1:n0, edge.cost)
Given the samples, this function returns the mean and the weight matrix.
Weight(G)
Weight(G)
G |
a list of all groups |
the mean and weight matrix
## Example: mean and weight matrix for two groups of sizes 20 and 40.
G <- list()
set.seed(1)
n1 <- 20; n2 <- 40
N <- n1 + n2
G[[1]] <- 1:n1
G[[2]] <- (n1 + 1):(n1 + n2)
Weight(G)
Given the groups, the weight matrix and the observed statistic, this function returns the p-value.
Wpermut(G,W,obs)
Wpermut(G,W,obs)
G |
a list of all groups |
W |
the weight matrix |
obs |
the observed statistic |
the p-value
## Example: permutation p-value for the weighted sum of between-sample
## edge counts (statistic S_w) over two multivariate normal samples.
G <- list()
set.seed(1)
n1 <- 20; n2 <- 40
N <- n1 + n2
G[[1]] <- 1:n1
G[[2]] <- (n1 + 1):(n1 + n2)
d <- 10
mu1 <- rep(0, d)
mu2 <- mu1 + 0.1             # small mean shift for sample 2
true.cov1 <- 0.4^(abs(outer(1:d, 1:d, "-")))
true.cov2 <- 0.4^(abs(outer(1:d, 1:d, "-")))
sam1 <- MASS::mvrnorm(n = n1, mu = mu1, Sigma = true.cov1)
sam2 <- MASS::mvrnorm(n = n2, mu = mu2, Sigma = true.cov2)
Data <- rbind(sam1, sam2)
Dist <- philentropy::distance(Data, method = "euclidean")
Dist[lower.tri(Dist)] <- NA  # keep the upper triangle only
diag(Dist) <- NA             # FIX: `Dist[diag(Dist)] <- NA` was a no-op (indexing by 0)
counts <- compbypath(G, Hpath(1, N, Dist))
W <- Weight(G)
# Set W[i, j] = 0 if we do not consider the relevance between sample i and sample j.
C <- counts$EC               # observed between-sample edge counts
WC <- W$weight * C           # weighted counts
WS <- sum(WC[!is.na(WC)])    # observed test statistic
Wpermut(G, W$weight, WS)