# REPLICATION CODE FOR: Lupu, Noam, Lucia Selios, and Zach Warner. 2016. "A
# New Measure of Congruence: The Earth Mover's Distance." Political Analysis
# (forthcoming).
#
# Last edited 10-Nov-2016 by ZW. For questions please contact me at
# zwarner@wisc.edu.

################################################################################
#                                                                              #
#                                 SETUP                                        #
#                                                                              #
################################################################################
rm(list=ls())
library(ggplot2); library(grid); library(sfsmisc); library(emdist)
library(clusterGeneration); library(foreign); library(plyr); library(sandwich) 
library(lmtest); library(reshape)
setwd("/your/path/to/this/replication")

### version control
sessionInfo()
# R version 3.3.0 (2016-05-03)
# Platform: x86_64-apple-darwin13.4.0 (64-bit)
# Running under: OS X 10.11.6 (El Capitan)
# 
# locale:
# [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
# 
# attached base packages:
# [1] grid      stats     graphics  grDevices utils     datasets  methods  
# [8] base     
# 
# other attached packages:
# [1] reshape_0.8.6           lmtest_0.9-34           zoo_1.7-13             
# [4] sandwich_2.3-4          plyr_1.8.4              foreign_0.8-67         
# [7] clusterGeneration_1.3.4 MASS_7.3-45             emdist_0.3-2           
# [10] sfsmisc_1.1-0           ggplot2_2.1.0          
# 
# loaded via a namespace (and not attached):
# [1] Rcpp_0.12.7      lattice_0.20-34  gtable_0.2.0     scales_0.4.0    
# [5] tools_3.3.0      munsell_0.4.3    rsconnect_0.5    colorspace_1.2-7  

################################################################################
#                                                                              #
#                                 UTILITIES                                    #
#                                                                              #
################################################################################

##### Helper functions #####
# for multiple ggplot objects
multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) {
  plots <- c(list(...), plotlist)
  numPlots = length(plots)
  if (is.null(layout)) {
    layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
                     ncol = cols, nrow = ceiling(numPlots/cols))
  }
  if (numPlots==1) {
    print(plots[[1]])
  } else {
    grid.newpage()
    pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))
    for (i in 1:numPlots) {
      matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))
      print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
                                      layout.pos.col = matchidx$col))
    }
  }
}
# to simulate from a bimodal distribution
bimodalDistFunc <- function (n,cpct, mu1, mu2, sig1, sig2) {
  y0 <- rnorm(n,mean=mu1, sd = sig1)
  y1 <- rnorm(n,mean=mu2, sd = sig2)
  
  flag <- rbinom(n,size=1,prob=cpct)
  y <- y0*(1 - flag) + y1*flag 
}
# R^2 approximation from glm objects
r2.corr.mer <- function(m) {
  lmfit <-  lm(model.response(model.frame(m)) ~ fitted(m))
  summary(lmfit)$r.squared
} 
# for clustered SEs
cl <- function(dat,fm, cluster){
  M <- length(unique(cluster))
  N <- length(cluster)
  K <- fm$rank
  dfc <- (M/(M-1))*((N-1)/(N-K))
  uj  <- apply(estfun(fm),2, function(x) tapply(x, cluster, sum));
  vcovCL <- dfc*sandwich(fm, meat=crossprod(uj)/N)
  coeftest(fm, vcovCL) 
}

##### Helper functions for computing congruence #####
# Difference in means
dmeans <- function(data){
  df <- data
  x <- as.matrix(df$samps[which(df$pop == "x")])
  y <- as.matrix(df$samps[which(df$pop == "y")])
  abs(mean(x,na.rm=T)-mean(y,na.rm=T))
}
# CDF overlap
cdf.overlap <- function(data, q = 512) { 
  # no default length for sequences, but we'll use density's 512
  df <- data
  x <- as.matrix(na.omit(df$samps[which(df$pop == "x")]))
  y <- as.matrix(na.omit(df$samps[which(df$pop == "y")]))
  Fx <- ecdf(x); Fy <- ecdf(y)
  lower <- min(x,y) 
  upper <- max(x,y)
  z <- seq(lower,upper,((upper-lower)/q))
  Fxz <- Fx(z); Fyz <- Fy(z) 
  sum(abs(Fxz-Fyz))
}
# PDF overlap
pdf.overlap <- function(data, q = 512){ # q = 512 is the default for density()
  df <- data
  fx <- as.vector(na.omit(df$samps[which(df$pop == "x")]))
  fy <- as.vector(na.omit(df$samps[which(df$pop == "y")]))
  lower <- min(c(fx, fy))
  upper <- max(c(fx, fy))
  dx <- density(fx, from=lower, to=upper, n = q)
  dy <- density(fy, from=lower, to=upper, n = q)
  d <- data.frame(location=dx$x, x.den=dx$y, y.den=dy$y)
  d$intersect <- pmin(d$x.den, d$y.den)
  integrate.xy(d$location, d$intersect)
}
# EMD
emd.dis <- function(data, metric = "manhattan", iterations=100000){
  df <- data
  x <- as.matrix(df$samps[which(df$pop == "x")])
  y <- as.matrix(df$samps[which(df$pop == "y")])
  weight.x <- rep(1/nrow(x),nrow(x))
  weight.y <- rep(1/nrow(y),nrow(y))
  emdw(x,weight.x,y,weight.y,dist=metric,max.iter=iterations)
}

################################################################################
#                                                                              #
#                               INTRODUCTION                                   #
#                                                                              #
################################################################################

##### SIMULATED COMPUTATION TIME (FN 3) #####
# generate the data
x <- genRandomClust(numClust=7,numNonNoisy = 10, numNoisy = 5, numOutlier = .02,
                    rangeN=c(150,500), numReplicate = 1,fileName="x", 
                    outputLogFlag=F, outputEmpirical=F, outputInfo = F, 
                    outputDatFlag = F)
y <- genRandomClust(numClust=3,numNonNoisy = 10, numNoisy = 5, numOutlier = .08, 
                    rangeN=c(150,500), numReplicate = 1,fileName="y", 
                    outputLogFlag=F, outputEmpirical=F, outputInfo = F, 
                    outputDatFlag = F)
x <- x$datList$x_1
y <- y$datList$y_1
w.x <- rep(1/nrow(x),nrow(x))
w.y <- rep(1/nrow(y),nrow(y))
# time it
system.time(
  emd.bigprob <- emdw(x,w.x,y,w.y,max.iter=100000,dist="manhattan")
)
# Results for my machine (mid-2012 MacBook Air, 2 GHz Intel Core i7 processor, 
# 8 GB 1600 MHz DDR3 memory)
# user      system    elapsed 
# 500.493   4.536     511.266 

### clean up
rm(list=ls()[!(ls() %in% c("dmeans","cdf.overlap","pdf.overlap","emd.dis",
                           "r2.corr.mer","multiplot","bimodalDistFunc","cl"))])

################################################################################
#                                                                              #
#                            MEASURING CONGRUENCE                              #
#                                                                              #
################################################################################

### Create the data
set.seed(1234567)
dat.x.y1 <- data.frame(samps = c(rnorm(1000, mean=0, sd=1), 
                                 rnorm(1000, mean=3, sd=1)), 
                       pop = rep(c("x", "y"), each = 1000))
dat.x.y3 <- dat.x.y2 <- dat.x.y1
dat.x.y2$samps[which(dat.x.y2$pop == "y")] <- rnorm(1000,0,8)
dat.x.y3$samps[which(dat.x.y3$pop == "y")] <- bimodalDistFunc(1000,.35,2.5,
                                                              13.1,1,2)

##### FIGURE 1 #####
fig1.1 <- ggplot(dat.x.y1, aes(x = samps, fill = pop)) +
  theme_bw(base_size = 16) +
  theme(panel.grid.minor = element_blank(), axis.text.x = element_blank(),
        panel.border = element_rect(fill=NA,size=1,linetype="solid")) +
  geom_histogram(aes(y = ..density.. ), alpha = 0.5,pos="identity",binwidth=1,
                 show.legend=F) +
  geom_line(aes(y = ..density..), stat = 'density') + 
  scale_fill_manual(values=c("grey20", "grey20")) +
  coord_cartesian(ylim=c(-.01,.44),xlim=c(-11,26)) +
  scale_x_continuous(breaks=seq(-10,25,by=5)) +
  scale_y_continuous(breaks=seq(0,.40,by=.1)) +
  annotate("text",x=-3,y=.20,label="X",family="Times",size=5) +
  annotate("text",x=6.2,y=.10,label="Y[1]",family="Times", parse=T,size=5) +
  labs(title=NULL,x=NULL,y="Density") +
  theme(text=element_text(family="Times"))
fig1.1
fig1.2 <- ggplot(dat.x.y2, aes(x = samps, fill = pop)) +
  theme_bw(base_size = 16) +
  theme(panel.grid.minor = element_blank(), axis.text.x = element_blank(),
        panel.border = element_rect(fill=NA,size=1,linetype="solid")) +
  geom_histogram(aes(y = ..density.. ), alpha = 0.5,pos="identity",binwidth=1,
                 show.legend=F) +
  geom_line(aes(y = ..density..), stat = 'density') + 
  scale_fill_manual(values=c("grey20", "grey20")) +
  coord_cartesian(ylim=c(-.01,.44),xlim=c(-11,26)) +
  scale_x_continuous(breaks=seq(-10,25,by=5)) +
  scale_y_continuous(breaks=seq(0,.40,by=.1)) +
  annotate("text",x=-3,y=.20,label="X",family="Times",size=5) +
  annotate("text",x=6.2,y=.10,label="Y[2]",family="Times", parse=T,size=5) +
  labs(title=NULL,x=NULL,y="Density") +
  theme(text=element_text(family="Times"))
fig1.2
fig1.3 <- ggplot(dat.x.y3, aes(x = samps, fill = pop)) +
  theme_bw(base_size = 16) +
  theme(panel.grid.minor = element_blank(),
        panel.border = element_rect(fill=NA,size=1,linetype="solid")) +
  geom_histogram(aes(y = ..density.. ), alpha = 0.5,pos="identity",binwidth=1,
                 show.legend=F) +
  geom_line(aes(y = ..density..), stat = 'density') + 
  scale_fill_manual(values=c("grey20", "grey20")) +
  coord_cartesian(ylim=c(-.01,.44),xlim=c(-11,26)) +
  scale_x_continuous(breaks=seq(-10,25,by=5)) +
  scale_y_continuous(breaks=seq(0,.40,by=.1)) +
  annotate("text",x=-3,y=.20,label="X",family="Times",size=5) +
  annotate("text",x=6.2,y=.10,label="Y[3]",family="Times", parse=T,size=5) +
  labs(title=NULL,x="z",y="Density") +
  theme(text=element_text(family="Times"))
fig1.3
pdf("figure1.pdf",
    width=6.5,height=9)
grid.newpage()
grid.draw(rbind(ggplotGrob(fig1.1), 
                ggplotGrob(fig1.2), 
                ggplotGrob(fig1.3),size="last"))
dev.off()

################################################################################
#                                                                              #
#                             SIMULATION EVIDENCE                              #
#                                                                              #
################################################################################

##### TABLE 1 #####
### Difference of means distance
o2o.1 <- dmeans(dat.x.y1)
o2o.2 <- dmeans(dat.x.y2)
o2o.3 <- dmeans(dat.x.y3)

### CDF overlap distance
cdf.1 <- cdf.overlap(dat.x.y1)
cdf.2 <- cdf.overlap(dat.x.y2)
cdf.3 <- cdf.overlap(dat.x.y3)

### PDF overlap distance
pdf.1 <- pdf.overlap(dat.x.y1)
pdf.2 <- pdf.overlap(dat.x.y2)
pdf.3 <- pdf.overlap(dat.x.y3)

### EMD
emd.1 <- emd.dis(dat.x.y1)
emd.2 <- emd.dis(dat.x.y2)
emd.3 <- emd.dis(dat.x.y3)

### print the table
tab <- matrix(c(o2o.1,o2o.2,o2o.3,
                cdf.1,cdf.2,cdf.3,
                pdf.1,pdf.2,pdf.3,
                emd.1,emd.2,emd.3),ncol=4)
dimnames(tab) <- list(c("Top","Middle","Bottom"),
                      c("Diff in Means","CDF overlap","PDF Overlap","EMD"))
round(tab, 2)

##### FIGURE 3 #####
### size of each sample to compare
sizedist = 100 
### number of samples to compare
numsamps = 1000 
### create target distribution, a standard-normal
target <- data.frame(samps =  rnorm(sizedist,0,1), 
                     pop = rep("x", sizedist))

### generate means
means <-rnorm(numsamps,10,10)
# create dataframes to store results
df.meanshift <- data.frame(matrix(ncol=0,nrow=numsamps))
df.meanshift$means <- means
df.meanshift$emd <- df.meanshift$pdf <- df.meanshift$cdf <- 
  df.meanshift$diffmeans <-rep(NA,numsamps)
samps.list <- lapply(1:length(means),  
                     function(i) data.frame(matrix(nrow=sizedist,ncol=0)))
samps.df.means <- data.frame(matrix(nrow=sizedist,ncol=numsamps))
# create samples from the random means, then calculate distances and store them
for(i in 1:numsamps){
  print(i)
  # sample from the simulated mean
  samps.list[[i]]$samps <- rnorm(n=sizedist, mean = means[i], sd = 1)
  samps.df.means[,i] <- samps.list[[i]]$samps
  samps.list[[i]]$pop <- rep("y",sizedist)
  # bind the target dist
  samps.list[[i]] <- rbind(samps.list[[i]],target)
  # calculate congruence in various ways
  df.meanshift$diffmeans[i] <- dmeans(samps.list[[i]])
  df.meanshift$cdf[i] <- cdf.overlap(samps.list[[i]])
  df.meanshift$pdf[i] <- pdf.overlap(samps.list[[i]])
  df.meanshift$emd[i] <- emd.dis(samps.list[[i]],iterations = 100000)
}

### generate variances
variances <-runif(numsamps,1,10)
# create dataframes to store results
df.variances <- data.frame(matrix(ncol=0,nrow=numsamps))
df.variances$variances <- variances
df.variances$emd <- df.variances$pdf <- df.variances$cdf <- 
  df.variances$diffmeans <-rep(NA,numsamps)
samps.list <- lapply(1:length(variances),  
                     function(i) data.frame(matrix(nrow=sizedist,ncol=0)))
samps.df.vars <- data.frame(matrix(nrow=sizedist,ncol=numsamps))
# create samples from the random means, then calculate distances and store them
for(i in 1:numsamps){
  print(i)
  # sample from the simulated mean
  samps.list[[i]]$samps <- rnorm(n=sizedist, mean = 0, sd = sqrt(variances[i]))
  samps.df.vars[,i] <- samps.list[[i]]$samps
  samps.list[[i]]$pop <- rep("y",sizedist)
  # bind the target dist
  samps.list[[i]] <- rbind(samps.list[[i]],target)
  # calculate congruence in various ways
  df.variances$diffmeans[i] <- dmeans(samps.list[[i]])
  df.variances$cdf[i] <- cdf.overlap(samps.list[[i]])
  df.variances$pdf[i] <- pdf.overlap(samps.list[[i]])
  df.variances$emd[i] <- emd.dis(samps.list[[i]],iterations = 100000)
}

### Get the means data into shape for plotting
# diff-means
d1 <- as.data.frame(cbind(abs(df.meanshift$means),abs(df.meanshift$diffmeans)))
colnames(d1) <- c("x","y")
d1$x <- d1$x/max(d1$x); d1$y <- d1$y/max(d1$y)
d1$true.rank <-  NA; d1$true.rank[order(d1$x)] <- 1:nrow(d1)
d1$calc.rank <-  NA; d1$calc.rank[order(d1$y)] <- 1:nrow(d1)
# CDF overlap
d2 <- as.data.frame(cbind(abs(df.meanshift$means),abs(df.meanshift$cdf)))
colnames(d2) <- c("x","y")
d2$x <- d2$x/max(d2$x); d2$y <- d2$y/max(d2$y)
d2$true.rank <-  NA; d2$true.rank[order(d2$x)] <- 1:nrow(d2)
d2$calc.rank <-  NA; d2$calc.rank[order(d2$y)] <- 1:nrow(d2)
# PDf overlap (slightly different)
d3 <- as.data.frame(cbind(abs(df.meanshift$means),abs(df.meanshift$pdf)))
colnames(d3) <- c("x","y")
d3$x <- d3$x/max(d3$x)
# the next line inverts distance so closer to 0 means "more similar,"
# like other measures
d3$y <- 1/d3$y
d3$y <- d3$y/max(d3$y) # scales it
d3$true.rank <-  NA; d3$true.rank[order(d3$x)] <- 1:nrow(d3)
d3$calc.rank <-  NA; d3$calc.rank[order(d3$y)] <- 1:nrow(d3)
# EMD
d4 <- as.data.frame(cbind(abs(df.meanshift$means),abs(df.meanshift$emd)))
colnames(d4) <- c("x","y")
d4$x <- d4$x/max(d4$x); d4$y <- d4$y/max(d4$y)
d4$true.rank <-  NA; d4$true.rank[order(d4$x)] <- 1:nrow(d4)
d4$calc.rank <-  NA; d4$calc.rank[order(d4$y)] <- 1:nrow(d4)

### Get the variances data ready for plotting
# subtract 1 off variance to account for target variance = 1, as in text
# diff-means
d5 <- as.data.frame(cbind(abs(df.variances$variances-1)
                          ,abs(df.variances$diffmeans)))
colnames(d5) <- c("x","y")
d5$x <- d5$x/max(d5$x); d5$y <- d5$y/max(d5$y)
d5$true.rank <-  NA; d5$true.rank[order(d5$x)] <- 1:nrow(d5)
d5$calc.rank <-  NA; d5$calc.rank[order(d5$y)] <- 1:nrow(d5)
# CDF overlap
d6 <- as.data.frame(cbind(abs(df.variances$variances-1),abs(df.variances$cdf)))
colnames(d6) <- c("x","y")
d6$x <- d6$x/max(d6$x); d6$y <- d6$y/max(d6$y)
d6$true.rank <-  NA; d6$true.rank[order(d6$x)] <- 1:nrow(d6)
d6$calc.rank <-  NA; d6$calc.rank[order(d6$y)] <- 1:nrow(d6)
# PDf overlap (slightly different)
d7 <- as.data.frame(cbind(abs(df.variances$variances-1),abs(df.variances$pdf)))
colnames(d7) <- c("x","y")
d7$x <- d7$x/max(d7$x)
# the next line inverts distance so closer to 0 means "more similar,"
# like other measures
d7$y <- 1/d7$y 
d7$y <- d7$y/max(d7$y) # scales it
d7$true.rank <-  NA; d7$true.rank[order(d7$x)] <- 1:nrow(d7)
d7$calc.rank <-  NA; d7$calc.rank[order(d7$y)] <- 1:nrow(d7)
# EMD
d8 <- as.data.frame(cbind(abs(df.variances$variances-1),abs(df.variances$emd)))
colnames(d8) <- c("x","y")
d8$x <- d8$x/max(d8$x); d8$y <- d8$y/max(d8$y)
d8$true.rank <-  NA; d8$true.rank[order(d8$x)] <- 1:nrow(d8)
d8$calc.rank <-  NA; d8$calc.rank[order(d8$y)] <- 1:nrow(d8)

### plot
fig3.1 <- ggplot(d1) + 
  theme_bw(base_size = 20) +
  theme(panel.grid.minor = element_blank(),panel.grid.major = element_blank(),
        panel.border = element_rect(fill=NA,size=1,linetype="solid"),
        axis.text = element_text(size=12)) +
  geom_point(aes(x=true.rank,y=calc.rank)) +
  coord_cartesian(xlim=c(0,1000),ylim=c(0,1000)) +
  geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "gray20") + 
  labs(title="Difference in means",
       x="True mean rank",
       y="Calculated distance rank",
       size=16) +
  theme(text=element_text(family="Times"))

fig3.2 <- ggplot(d2) + 
  theme_bw(base_size = 20) +
  theme(panel.grid.minor = element_blank(),panel.grid.major = element_blank(),
        panel.border = element_rect(fill=NA,size=1,linetype="solid"),
        axis.text = element_text(size=12)) +
  geom_point(aes(x=true.rank,y=calc.rank)) +
  coord_cartesian(xlim=c(0,1000),ylim=c(0,1000)) +
  geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "gray20") + 
  labs(title="CDF Overlap",
       x="True mean rank",
       y="Calculated distance rank",
       size=16) +
  theme(text=element_text(family="Times"))

fig3.3 <- ggplot(d3) + 
  theme_bw(base_size = 20) +
  theme(panel.grid.minor = element_blank(),panel.grid.major = element_blank(),
        panel.border = element_rect(fill=NA,size=1,linetype="solid"),
        axis.text = element_text(size=12)) +
  geom_point(aes(x=true.rank,y=calc.rank)) +
  coord_cartesian(xlim=c(0,1000),ylim=c(0,1000)) +
  geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "gray20") + 
  labs(title="PDF Overlap",
       x="True mean rank",
       y="Calculated distance rank",size=16) +
  theme(text=element_text(family="Times"))

fig3.4 <- ggplot(d4) + 
  theme_bw(base_size = 20) +
  theme(panel.grid.minor = element_blank(),panel.grid.major = element_blank(),
        panel.border = element_rect(fill=NA,size=1,linetype="solid"),
        axis.text = element_text(size=12)) +
  geom_point(aes(x=true.rank,y=calc.rank)) +
  coord_cartesian(xlim=c(0,1000),ylim=c(0,1000)) +
  geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "gray20") + 
  labs(title="EMD",
       x="True mean rank",
       y="Calculated distance rank",
       size=16) +
  theme(text=element_text(family="Times"))

fig3.5 <- ggplot(d5) + 
  theme_bw(base_size = 20) +
  theme(panel.grid.minor = element_blank(),panel.grid.major = element_blank(),
        panel.border = element_rect(fill=NA,size=1,linetype="solid"),
        axis.text = element_text(size=12)) +
  geom_point(aes(x=true.rank,y=calc.rank)) +
  coord_cartesian(xlim=c(0,1000),ylim=c(0,1000)) +
  geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "gray20") + 
  labs(title="Difference in means",
       x="True variance rank",
       y="Calculated distance rank",
       size=16) +
  theme(text=element_text(family="Times"))

fig3.6 <- ggplot(d6) + 
  theme_bw(base_size = 20) +
  theme(panel.grid.minor = element_blank(),panel.grid.major = element_blank(),
        panel.border = element_rect(fill=NA,size=1,linetype="solid"),
        axis.text = element_text(size=12)) +
  geom_point(aes(x=true.rank,y=calc.rank)) +
  coord_cartesian(xlim=c(0,1000),ylim=c(0,1000)) +
  geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "gray20") + 
  labs(title="CDF Overlap",
       x="True variance rank",
       y="Calculated distance rank",
       size=16) +
  theme(text=element_text(family="Times"))

fig3.7 <- ggplot(d7) + 
  theme_bw(base_size = 20) +
  theme(panel.grid.minor = element_blank(),panel.grid.major = element_blank(),
        panel.border = element_rect(fill=NA,size=1,linetype="solid"),
        axis.text = element_text(size=12)) +
  geom_point(aes(x=true.rank,y=calc.rank)) +
  coord_cartesian(xlim=c(0,1000),ylim=c(0,1000)) +
  geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "gray20") + 
  labs(title="PDF Overlap",
       x="True variance rank",
       y="Calculated distance rank",
       size=16) +
  theme(text=element_text(family="Times"))

fig3.8 <- ggplot(d8) + 
  theme_bw(base_size = 20) +
  theme(panel.grid.minor = element_blank(),panel.grid.major = element_blank(),
        panel.border = element_rect(fill=NA,size=1,linetype="solid"),
        axis.text = element_text(size=12)) +
  geom_point(aes(x=true.rank,y=calc.rank)) +
  coord_cartesian(xlim=c(0,1000),ylim=c(0,1000)) +
  geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "gray20") + 
  labs(title="EMD",
       x="True variance rank",
       y="Calculated distance rank",
       size=16) +
  theme(text=element_text(family="Times"))

pdf("figure3.pdf",width=16,height=8)
multiplot(fig3.1,fig3.5,fig3.2,fig3.6,fig3.3,fig3.7,fig3.4,fig3.8, cols=4)
dev.off()

##### TABLE 2 #####
### calculate RMSE for ordinality
rmse.ord <- vector()
for(i in 1:8){
  tmp <- eval(parse(text=paste("d",i,sep="")))
  tmp$sqerr <- (tmp$true.rank - tmp$calc.rank)^2
  rmse.ord[i] <- sqrt(sum(tmp$sqerr)/nrow(tmp))
}
### calculate RMSE for cardinality
rmse.card <- vector()
for(i in 1:8){
  tmp <- eval(parse(text=paste("d",i,sep="")))
  tmp$sqerr <- (tmp$x - tmp$y)^2
  rmse.card[i] <- sqrt(sum(tmp$sqerr)/nrow(tmp))
}

### print the table
rmses <- c(rmse.ord,rmse.card)
rmses <- matrix(rmses,ncol=4,byrow=F)
dimnames(rmses) <- list(c("Diff in Means","CDF overlap","PDF Overlap","EMD"),
                        c("Ord: Mean","Ord: Variance", "Card: Mean",
                          "Card: Variance"))
round(rmses, 2)

### clean up
rm(list=ls()[!(ls() %in% c("dmeans","cdf.overlap","pdf.overlap","emd.dis",
                           "r2.corr.mer","multiplot","bimodalDistFunc","cl",
                           "samps.df.means","samps.df.vars","d1","d2","d3",
                           "d4","d5","d6","d7","d8","target"))])