# 25.1.2010
# Simulation des Kigman-Koaleszenten mit Mutationen
# und des sich ergebenden Tajimas D

sim.freqspec.wachsend <- function(n=10, theta=2.0, beta=1.0) {
# n = Stichprobengroesse, theta = Mutationsparameter 
  
# times[i] ist die Zeit auf Niveau i, i=2,...,n
# (der Faulheit halber mit times[1]=0 ergaenzt, so dass keine
#  Indexverschiebung beim Zugriff noetig ist)

  v<- 0
  times <- numeric(n)
  for (i in n:2) {
    times[i] <- log(1+beta*rexp(1,rate=choose(i,2))*exp(-beta*v))/beta
    v <- v+times[i]
  }
  times[1]<-0

  # times <- c(0,sapply(2:n, function(j) rexp(1, rate=choose(j,2))))

merges <- matrix(0,ncol=2,nrow=n)

mutations <- matrix(0, ncol=n, nrow=n) 

# no.leaves.above[i,j] ist die Anzahl Blaetter oberhalb
# von Kante j auf dem i-ten Niveau
no.leaves.above <- matrix(0, ncol=n, nrow=n) 
no.leaves.above[n,] <- rep(1, times=n)

# am Ende von Niveau i verschmelzen Linien merges[i,1] und merges[i,2]
# (merges[i,1] < merges[i,2],
#  Interpretation: Linien mit Nummern >= merges[i,2] werden umnummeriert,
#  so dass ihre neue Nummer eins kleiner ist als ihre alte)

for (i in n:2) {
  merges[i,] <- sort(sample(i,size=2,replace=FALSE))
  no.leaves.above[i-1,1:(merges[i,2]-1)]<-no.leaves.above[i,1:(merges[i,2]-1)]
  no.leaves.above[i-1,merges[i,1]]<-no.leaves.above[i-1,merges[i,1]]+
    no.leaves.above[i,merges[i,2]]
  if (merges[i,2]<i)
    no.leaves.above[i-1,merges[i,2]:(i-1)]<-
      no.leaves.above[i,(merges[i,2]+1):i]
}

# mutations[i,j] ist die Anzahl Mutationen auf Kante j
# im i-ten Niveau
for (i in 2:n) {
  mutations[i,1:i] <- rpois(i, lambda=times[i]*theta/2)
}

# Gesamtanzahl Mutationen
Sn <- sum(mutations)

# Mutationshaeufigkeitsspektrum
freqspec <- array(0, dim=n)
for (i in 2:n) {
  for (j in 1:i) {
    freqspec[no.leaves.above[i,j]]<-
      freqspec[no.leaves.above[i,j]]+mutations[i,j]
  }
}

## # Matrix der Anzahl paarweiser Unterschiede zwischen den
## # Stichproben
## Diff <- matrix(0, ncol=n, nrow=n)

## for (i1 in 1:(n-1)) {
##   for (i2 in i1:n) {
##     anc1 <- i1; anc2 <- i2
##     d <- 0; t <- n
##     while(anc1 != anc2) {
##       d<-d+mutations[t,anc1]+mutations[t,anc2]
##       # Finde die entsprechenden Ahnen eine Generation frueher:
##       if (anc1==merges[t,2])
##         anc1<-merges[t,1]
##       else if (anc1>merges[t,2])
##         anc1<-anc1-1

##       if (anc2==merges[t,2])
##         anc2<-merges[t,1]
##       else if (anc2>merges[t,2])
##         anc2<-anc2-1

##       t<-t-1
##       #cat("t=",t," anc1=", anc1, " anc2=",anc2,"\n")
##    }
##     Diff[i1,i2]<-d
##     #cat("\n")
##   }
## }

## Tajimas.pi.contr <- sum(Diff)/choose(n,2)

Tajimas.pi <- sum(sapply(1:n, function(i) freqspec[i]*i*(n-i)))/choose(n,2)

hn <- sum(1/(1:(n-1))); gn <- sum(1/((1:(n-1))^2))

c1 <- (n+1)/(3*(n-1)) - 1/hn
c2 <- gn/(hn^2) + 2*(n^2+n+3)/(9*n*(n-1)) - (n+2)/(hn*n)

VSn <- c1*Sn/hn + c2*Sn*(Sn-1)/(hn^2+gn)

D <- (Tajimas.pi-Sn/hn)/sqrt(VSn)

list(times=times, merges=merges, mutations=mutations,
     Sn=Sn, #Diff=Diff,
     Tajimas.pi=Tajimas.pi,
     #Tajimas.pi.contr=Tajimas.pi.contr,
     VSn=VSn, D=D)
}


# Empirische Verteilung von Tajimas D unter Kingman-Koaleszent
# mit Populationswachstum:

n <- 20
theta <- 10.0
beta <- 2.0

wdh <- 500 # Anz. Simulationen

TD.values <- numeric(wdh)

for (r in 1:wdh) {
  TD.values[r] <- sim.freqspec.wachsend(n, theta, beta)$D
}

hist.title <- c("Histgramm der empir. Vert. von Tajimas D",
                "unter Koaleszent mit Populationswachstum",
                paste("Wachstumsrate beta=",beta,sep=""), 
                paste("Stichpr.gr. ",n,", theta=",theta,",",sep=""),
                paste("basierend auf",wdh,"Simulationen"))

hist(TD.values, main=hist.title, prob=TRUE,
     xlab="Tajimas D",
     sub=paste("empir. MW=",round(mean(TD.values),digits=3),
       "  empir. Var=", round(var(TD.values),digits=3),sep=""))