# Workshop "Bayesian methods to estimate species divergence times" # July 2015 # Mario dos Reis, Ziheng Yang and Phil Donoghue # A Bayesian phylogenetics inference program rm(list=ls()) # ########################################## # (A) Simple example, calculated analytically # ########################################## # The data are a pairwise sequence alignment of the 12s RNA gene sequences # from human and orangutan. # * The alignment is n = 948 nucleotides long, with x = 90 differences between sequences. # (Example 7.1 p.216, Yang 2014 Molecular Evolution: A Statistical Approach) # Our aim is to estimate the molecular distance d, in substitutions per site, # between the two sequences. Because the two species are closely related (i.e., # the alignment has less than 10% divergence) we use the Jukes and Cantor (1969) # nucleotide substitution model (JC69). # The Bayesian estimate of d, is given by the posterior distribution: # p(d | x) = C * p(d) * p(x | d). # p(d) is the prior on the distance. # p(x | d) is the (Jukes-Cantor) likelihood, i.e. the probability of observing # the data x given d, and C is a normalising constant, C = 1 / ∫ p(d) p(x | d) dd # likelihood under JC69 model, i.e. L = p(x | d): L <- function(d, x, n) (3/4 - 3*exp(-4*d/3)/4)^x * (1/4 + 3*exp(-4*d/3)/4)^(n - x) # plot the likelihood: curve(L(x, 90, 948), from=0, to=0.2, n=500, xlab="distance, d", ylab="likelihood, p(x | d)") # The maximum likelihood estimate is (the highest point in the curve) d.mle <- -3/4 * log(1 - 4/3 * 90/948) # [1] 0.1015060 abline(v=d.mle, lty=2) title(expression(hat(d) == 0.1015)) # For the prior on d, we use an exponential distribution with mean = 0.2 # p(d) = exp(-d/mu) / mu; mu = 0.2. d.prior <- function(d, mu) exp(-d/mu) / mu # un-normalised posterior: d.upost <- function(d, x, n, mu) d.prior(d, mu=mu) * L(d, x=x, n=n) # To calculate the normalising constant C, we need to integrate the un-normalised posterior: integrate(d.upost, lower=0, upper=Inf, x=90, n=948, mu=0.2, abs.tol=0) # 5.167762e-131 with absolute error < 1.9e-135 C <- 1 / 5.167762e-131 # The posterior distribution function (when x=90, n=948 and mu=0.2)is thus d.post <- function(d) C * d.upost(d=d, x=90, n=948, mu=0.2) # Plot posterior and add prior: curve(d.post(x), from=0, to=0.2, n=500, xlab="d", ylab="density") curve(d.prior(x, mu=0.2), add=TRUE, lty=2) # EXERCISE 1: Try adding the likelihood curve to the plots above. # HINT: You may want to scale the likelihood by a constant like C * 3 # The posterior mean of d is found by integrating ∫ d * p(d | x) dd f <- function(d) d * d.post(d) integrate(f, lower=0, upper=Inf, abs.tol=0) # 0.1021246 with absolute error < 5.5e-06 # Add it to the plot: abline(v=0.1021246, lty=3) # posterior mean # Add MLE: abline(v=d.mle, lty=3, col="red") # The posterior distribution can be used to answer questions such as # what is the probability that d > 0.1? integrate(d.post, lower=0.1, upper=Inf) # 0.5632372 with absolute error < 1.6e-08, i.e. 56.3% # The 95% equal-tail credibility interval is (0.08191, 0.12463) integrate(d.post, lower=0, upper=0.08191) # 2.5% integrate(d.post, lower=0, upper=0.12463) # 97.5% # EXERCISE 2 (VERY HARD): Can you work out how to calculate the 99% CI? # HINT: You need to find the upper bounds where the integral is 0.005 # and 0.995 respectively. The integrate and optim functions in R may be useful. 
# ###################################################################
# (B) The same example analysed using Markov chain Monte Carlo (MCMC)
# ###################################################################

# We now obtain the posterior distribution by MCMC sampling.
# In most practical problems, the constant C cannot be calculated (either
# analytically or numerically), and so MCMC becomes necessary.

# To avoid numerical problems, we calculate the likelihood, prior and posterior
# on the log scale, so we define and use the following functions:
lnL <- function(d, x, n) x*log(3/4 - 3*exp(-4*d/3)/4) + (n - x)*log(1/4 + 3*exp(-4*d/3)/4)
d.lnprior <- function(d, mu) -d/mu - log(mu) # the log(mu) term is constant & can be deleted
d.lnpost <- function(d, x, n, mu) d.lnprior(d, mu=mu) + lnL(d, x=x, n=n) # unnormalised

# Draft MCMC algorithm:
# 1. Set the initial state for d.
# 2. Propose a new state d* (from an appropriate proposal density).
# 3. Accept or reject the proposal with probability
#    min(1, p(d*)p(x|d*) / p(d)p(x|d)).
#    If the proposal is accepted set d=d*, otherwise d stays the same.
# 4. Save d.
# 5. Go to step 2.

d.mcmcf <- function(init.d, N, w) {
  # init.d is the initial state.
  # N is the number of 'generations' the algorithm is run for.
  # w is the 'width' of the proposal density.
  d <- numeric(N+1) # Here we will keep the visited values of d.
  d[1] <- init.d
  dnow <- init.d
  acc.prop <- 0 # acceptance proportion

  for (i in 1:N) {
    # here we use a uniform density with reflection to propose a new d*
    dnew <- dnow + runif(1, -w/2, w/2)
    #dnew <- dnow + rnorm(1, 0, w) # a normal proposal, using w as the SD.
    if (dnew < 0) dnew <- -dnew

    lnalpha <- d.lnpost(dnew, x=90, n=948, mu=0.2) - d.lnpost(dnow, x=90, n=948, mu=0.2)

    # accept the proposal if lnalpha > 0, or if u < alpha where u ~ U(0,1):
    if (lnalpha > 0 || runif(1, 0, 1) < exp(lnalpha))
      { dnow <- dnew; acc.prop <- acc.prop + 1 }
    # else reject it, so that dnow stays the same.

    d[i+1] <- dnow
    # Can you think of a way to print the progress of the MCMC to the screen?
  }
  # print out the proportion of times the proposal was accepted
  print(c("Acceptance proportion:", acc.prop/N))
  return (d) # return the vector of d's visited during the MCMC
}

# Test the algorithm:
d.1 <- d.mcmcf(0.5, 100, 0.08)
# plot the output:
plot(d.1, type='l', xlab="generation", ylab="d") # This is the 'trace' plot of d.
# The chain wiggles about, spending time at different values of d in
# proportion to their posterior density.
# Add the 95% CI as a reference:
abline(h=c(0.08191, 0.12463), lty=2, col="red")

# It may be useful to calculate the time it takes to run a given number of generations:
system.time(d.mcmcf(0.5, 10000, 0.08))
# ~0.6s on my laptop, which means 1e6 generations would take ~60s, or 1 min.

# EXERCISE 2a: Note that each MCMC iteration calls the function for calculating the
# log posterior twice, for dnow and dnew. This wastes computation, since the value
# for dnow was already calculated in the previous iteration. Modify the code (by
# recording the log posterior for dnow, so that you only need to calculate the value
# for dnew in every iteration; see the sketch below). Run long chains to see whether
# this reduces the running time. Typically, C code runs at least 100 times faster
# than R code implementing the same algorithm.
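# A minimal sketch of the modification suggested in EXERCISE 2a (d.mcmcf2 is a
# hypothetical name introduced here; try your own version first):
d.mcmcf2 <- function(init.d, N, w) {
  d <- numeric(N+1)
  d[1] <- dnow <- init.d
  lnpnow <- d.lnpost(dnow, x=90, n=948, mu=0.2) # computed once, then cached
  acc.prop <- 0
  for (i in 1:N) {
    dnew <- dnow + runif(1, -w/2, w/2)
    if (dnew < 0) dnew <- -dnew
    lnpnew <- d.lnpost(dnew, x=90, n=948, mu=0.2) # only one evaluation per iteration
    if (lnpnew - lnpnow > 0 || runif(1) < exp(lnpnew - lnpnow)) {
      dnow <- dnew; lnpnow <- lnpnew; acc.prop <- acc.prop + 1
    }
    d[i+1] <- dnow
  }
  print(c("Acceptance proportion:", acc.prop/N))
  return(d)
}
# Compare, e.g., system.time(d.mcmcf(0.5, 1e5, 0.08)) against
# system.time(d.mcmcf2(0.5, 1e5, 0.08)).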
# Get a very long sample:
d.long <- d.mcmcf(0.5, 10000, 0.08)
plot(d.long, type='l')
# The trace plot looks like a dense, hairy caterpillar!
# This is good!

# Remove the initial state and the burn-in:
d.long <- d.long[-(1:101)]
hist(d.long, prob=TRUE, n=50, xlab="d", main="") # The sample histogram from the MCMC.
curve(d.post(x), add=TRUE, col="red", lwd=2) # The true posterior distribution.

# Alternatively:
plot(density(d.long, adj=1.5)); rug(d.long)
curve(d.post(x), add=TRUE, col="red", lwd=2)

# EXERCISE 3: What does adj in the density function do? Try different values
# such as adj=0.1 or adj=1.0.
# How long (how many generations, N) do you need to run the MCMC so that
# the density plot is almost identical to the true distribution?

# Now we can get the posterior mean and 95% CI from the sample:
mean(d.long) # should be close to 0.1021246
quantile(d.long, prob=c(0.025, 0.975)) # should be close to (0.08191, 0.12463)

# Effect of the step size, w:
d.1 <- d.mcmcf(0.5, 100, w=0.08) # moderate step size
plot(d.1, type='b', col="black", pch=19, cex=.5) # chain moves about
d.2 <- d.mcmcf(0.5, 100, w=0.01) # tiny step size
lines(d.2, type='b', col="green", pch=19, cex=.5) # baby steps, chain moves slowly
d.3 <- d.mcmcf(0.5, 100, w=1) # large step size
lines(d.3, type='b', col="blue", pch=19, cex=.5) # chain gets stuck easily

# EXERCISE 4: The optimal acceptance proportion is about 40%. Try various step sizes, w,
# and see the effect on the acceptance proportion. What is the optimal w?
# Finding the best step size is called fine-tuning.

##################################
## (B') Behaviour in the long tails
##################################
### If init.d is very large, lnL is nearly constant, and d decreases linearly.
### The acceptance proportion and the change in d can be calculated analytically.
### Left moves are all accepted. Right moves are accepted with a probability
### that is a function of \Delta d.
w=0.5; d0=10000; N=1e4; mu=0.2;
0.5 + mu*1/w*(1-exp(-1/mu*w/2)) ### acceptance proportion
d0 + (-w/8 + 1/w*(mu*mu - mu*mu*exp(-1/mu*w/2)*(1/mu*w/2+1)))*N ### d_N
d.1 <- d.mcmcf(d0, N, w)
plot(d.1, type='l', xlab="generation", ylab="d") # This is the 'trace' plot of d.
d.1[N+1] # the final state, to compare with the prediction for d_N above

# ###################################################################
# (C) Autocorrelation, efficiency, effective sample size and thinning
# ###################################################################

# States in an MCMC sample are autocorrelated, because each new state is
# either the previous state or a modification of it.
x <- d.long[-length(d.long)] # remove the last state
y <- d.long[-1] # remove the first state
plot(x, y, xlab="current state", ylab="next state", main="lag, k=1")
cor(x, y) # this is the autocorrelation of the d sample at lag 1, ~50-60%

# We can use the acf function to calculate the autocorrelation at various lags:
d.acf <- acf(d.long)
# The correlation is lost after a lag of about k=10.
d.acf$acf # the actual acf values are here.

# If the autocorrelation is high, the chain will be inefficient,
# i.e. we will need to run the chain for a long time to obtain a good
# approximation to the posterior distribution.
# The efficiency of a chain is defined as
# eff = 1 / (1 + 2(r1 + r2 + r3 + ...)),
# where r_k is the correlation at lag k.
eff <- function(acf) 1 / (1 + 2 * sum(acf$acf[-1]))
eff(d.acf) # ~30%. What does that mean?

# Efficiency is related to the effective sample size (ESS) by ESS = N * eff:
N <- length(d.long) # 9,900
length(d.long) * eff(d.acf) # ~2,700

# That is, our MCMC sample of size 9,900 is as informative about d as an
# independent sample of size ~2,700. In other words, 30% efficiency means
# the estimate (of the posterior mean) from an MCMC sample of size N has the
# same variance as the estimate from an independent sample of size N*0.3.

# EXERCISE 5: Run a long chain (say N=1e4) with step size w=0.01.
# Calculate the efficiency and the ESS.

# Do it at home: Try different values of w, and plot eff against w.
# What value of w gives the highest eff? (A sketch follows below.)
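# A minimal sketch of the w sweep for EXERCISE 5 and the home exercise
# (the w values are illustrative; every run is stochastic, so your numbers
# will differ):
ws <- c(0.005, 0.01, 0.05, 0.1, 0.2, 0.5, 1)
effs <- sapply(ws, function(w)
  eff(acf(d.mcmcf(0.1, 1e4, w)[-(1:101)], lag.max=200, plot=FALSE)))
plot(ws, effs, type='b', log="x", xlab="step size, w", ylab="efficiency")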
# Usually, to save hard drive space, the MCMC sample is 'thinned', i.e. rather than
# saving every single value sampled, we only save every m-th value. The resulting
# sample will be smaller (thus saving hard drive space) but still has nearly as much
# information as the full sample. Usually we modify the MCMC code to do this.
# Here we reduce the sample to ~1/4 of the original size:
d.reduced <- d.long[seq(from=1, to=N, by=4)]
mean(d.reduced) # still close to 0.1021246
quantile(d.reduced, c(0.025, 0.975)) # still close to (0.08191, 0.12463)

# ###############################################################
# (D) MCMC diagnostics. How do we know our results are reliable?
# ###############################################################

# (1) Run the chain from different starting values; the chains
# should converge to the same region in parameter space:
d.high <- d.mcmcf(0.4, 1e2, 0.1)
plot(d.high, col="red", ty='l', ylim=c(0, 0.4))
d.middle <- d.mcmcf(0.1, 1e2, 0.1)
lines(d.middle, col="black", ty='l')
d.low <- d.mcmcf(0, 1e2, 0.1)
lines(d.low, col="blue", ty='l')
abline(h=c(0.08191, 0.12463), lty=2)

# (2) Compare the histograms and summary statistics obtained
# from several long, independent runs:
d.long1 <- d.mcmcf(0.4, 1e3, 0.1)[-(1:101)]
d.long2 <- d.mcmcf(0.0, 1e3, 0.1)[-(1:101)]
plot(density(d.long1, adj=1), col="red")
lines(density(d.long2, adj=1), col="blue")
mean(d.long1); mean(d.long2)
# relative difference
abs(mean(d.long1) - mean(d.long2)) / mean(c(d.long1, d.long2)) * 100

# It is extremely important that you run all your MCMC analyses at least twice!
# You cannot assess the robustness of an MCMC with only one run!

# Try again using N=1e5 (this will take a little while to finish):
d.long1 <- d.mcmcf(0.4, 1e5, 0.1)[-(1:101)]
d.long2 <- d.mcmcf(0.4, 1e5, 0.1)[-(1:101)]

# (3) Calculate the ESS. Ideally, we should aim for ESS > 1,000, when
# summary statistics such as the posterior mean and 95% CI can be
# accurately estimated. ESS > 10,000 offers superb performance, but this
# is rarely achieved in phylogenetic problems. ESS < 100 is poor:
length(d.long1) * eff(acf(d.long1)) # > 22,000 (nice!)

# (4) Plot running means:
rm1 <- cumsum(d.long1) / (1:length(d.long1))
rm2 <- cumsum(d.long2) / (1:length(d.long2))
plot(rm1, ty='l', ylab="posterior mean", xlab="generation"); lines(rm2, col="red")

# There are more diagnostic tests and statistics that can be applied, for
# example the potential scale reduction statistic
# (p. 242, Yang 2014 Molecular Evolution: A Statistical Approach).
# R packages such as CODA have been written to perform MCMC
# diagnostics, and they may prove quite useful (a sketch follows below).
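# A minimal sketch using the coda package (not a base package; you may need
# install.packages("coda") first):
library(coda)
effectiveSize(mcmc(d.long1))                          # ESS as estimated by coda
gelman.diag(mcmc.list(mcmc(d.long1), mcmc(d.long2))) # potential scale reduction factor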
# #################################################
# (E) MCMC in 2 dimensions - Molecular clock dating
# #################################################

# Now consider estimating the time of divergence between human and orangutan
# using the data above, x=90, n=948.
# The molecular distance is the product of geological time, t, and the
# molecular rate, r: d = r * t.

# Our JC69 log-likelihood as a function of r and t is:
lnL.rt <- function(r, t, x, n) lnL(d=r*t, x=x, n=n)

# The log-likelihood is now a 2D surface:
r <- seq(from=2e-3, to=6e-3, len=50)
t <- seq(from=10, to=40, len=50)
# create a table of all combinations of r and t values:
rt <- expand.grid(r=r, t=t)
z <- lnL.rt(r=rt$r, t=rt$t, x=90, n=948)

# The log-likelihood surface:
image(r, t, matrix(z, ncol=50), xlab="r (s/My)", ylab="t (Ma)")
contour(r, t, matrix(z, ncol=50), nlevels=5, add=TRUE)
# The log-likelihood looks like a flat, curved, mountainous ridge.
# The top of the ridge is located at d.mle = 0.1015060 = r * t:
curve(d.mle/x, add=TRUE, col="blue", lwd=2, lty=2)

# The molecular data are informative about d only, but not about r or t
# separately. We say that r and t are confounded parameters. Any combination
# of r and t that satisfies 0.1015060 = r * t is an ML solution.

# We can make a neat 3D plot of the log-likelihood:
persp(r, t, matrix(z, ncol=50), zlab="log-likelihood", theta=40, phi=40)

# The Bayesian method provides a useful way to estimate t and r through
# the use of priors.
# The fossil record suggests that the common ancestor of human and orangutan
# lived 33.7-11.2 Ma (see Benton et al. 2009 in the Timetree of Life book).
# Thus the prior on t is a normal distribution with mean 22.45 (the midpoint
# of the fossil calibration) and sd 5.6, so that the 95% range is roughly
# 33.7 to 11.2.
t.lnprior <- function(t, mu=22.45, sd=5.6) -((t-mu)/sd)^2 / 2 # additive constant ignored
curve(t.lnprior(x), from=5, to=40, n=5e2, xlab="t (Ma)", ylab="log density")

# For the rate, r, we use an exponential distribution with mean
# mu = 0.10 / 22.45 = 0.00445 (based on the distance MLE and the midpoint
# of the calibration):
r.lnprior <- function(r, mu=0.10/22.45) -r/mu - log(mu)
curve(r.lnprior(x), from=0, to=.02, xlab="r (s/My)", ylab="log density")

# We can plot the log joint prior:
z1 <- r.lnprior(rt$r) + t.lnprior(rt$t)
image(r, t, matrix(z1, ncol=50), xlab="r (s/My)", ylab="t (Ma)")
contour(r, t, matrix(z1, ncol=50), nlevels=5, add=TRUE)

# The log un-normalised posterior is thus:
rt.lnpost <- function(r, t, x=90, n=948) r.lnprior(r) + t.lnprior(t) + lnL.rt(r, t, x, n)

# We can plot the log un-normalised posterior:
z2 <- rt.lnpost(rt$r, rt$t)
image(r, t, matrix(z2, ncol=50), xlab="r (s/My)", ylab="t (Ma)")
contour(r, t, matrix(z2, ncol=50), nlevels=5, add=TRUE)
curve(d.mle/x, add=TRUE, col="blue", lty=2, lwd=2)

# EXERCISE 6: Can you plot the un-normalised posterior as a 3D surface using persp?

# We are now ready to build a two-parameter MCMC!
# The draft algorithm is as follows:
# 1. Set initial states for t and r.
# 2a. Propose a new state t* (from a uniform sliding window of width wt, for example).
# 2b. Accept or reject the proposal with probability
#     min(1, p(r)p(t*)p(x|d*) / p(r)p(t)p(x|d)), where d* = r * t*.
#     If the proposal is accepted set t=t*, otherwise t stays the same.
# 3a. Propose a new state r* (from a uniform sliding window of width wr, for example).
# 3b. Accept or reject the proposal with probability
#     min(1, p(r*)p(t)p(x|d*) / p(r)p(t)p(x|d)), where d* = r* * t.
#     If the proposal is accepted set r=r*, otherwise r stays the same.
# 4. Save r and t.
# 5. Go to step 2a.

# EXERCISE 7: Create a function called rt.mcmcf based on the d.mcmcf function
# (a sketch follows below). Run your newly written phylogenetics MCMC program!
# You will need two step sizes, wt and wr. Vary the step sizes and check the
# performance of the MCMC chain. Run the analysis several times, and calculate
# posterior means and 95% CIs for r and t. Calculate the efficiency and ESS.
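# A minimal sketch of a possible rt.mcmcf (one implementation among many; try
# writing your own before reading this one):
rt.mcmcf <- function(init.r, init.t, N, wr, wt) {
  r.sam <- t.sam <- numeric(N+1) # values of r and t visited during the MCMC
  r.sam[1] <- rnow <- init.r
  t.sam[1] <- tnow <- init.t
  acc.r <- acc.t <- 0 # acceptance counts for the two proposals
  for (i in 1:N) {
    # propose, then accept or reject, a new t* (uniform window with reflection):
    tnew <- tnow + runif(1, -wt/2, wt/2)
    if (tnew < 0) tnew <- -tnew
    lnalpha <- rt.lnpost(rnow, tnew) - rt.lnpost(rnow, tnow)
    if (lnalpha > 0 || runif(1) < exp(lnalpha)) { tnow <- tnew; acc.t <- acc.t + 1 }
    # propose, then accept or reject, a new r*:
    rnew <- rnow + runif(1, -wr/2, wr/2)
    if (rnew < 0) rnew <- -rnew
    lnalpha <- rt.lnpost(rnew, tnow) - rt.lnpost(rnow, tnow)
    if (lnalpha > 0 || runif(1) < exp(lnalpha)) { rnow <- rnew; acc.r <- acc.r + 1 }
    r.sam[i+1] <- rnow; t.sam[i+1] <- tnow
  }
  print(c("Acceptance proportions (t, r):", acc.t/N, acc.r/N))
  return(list(r=r.sam, t=t.sam))
}
# For example (wt and wr will need fine-tuning):
rt.sam <- rt.mcmcf(init.r=4e-3, init.t=20, N=1e4, wr=2e-3, wt=10)
plot(rt.sam$t, rt.sam$r, ty='l', xlab="t (Ma)", ylab="r (s/My)")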
# A 3D density histogram of the MCMC posterior can be obtained using the kde2d
# function in the MASS package (a sketch using the rt.mcmcf output above is
# given at the end of this script).

# EXERCISE 8: Try using a normal proposal density (rather than uniform) with reflection
# for r and t. Does it make a difference?

# EXERCISE 9 (VERY HARD): The normalising constant in this case is a double integral,
# C = 1 / ∫∫ p(r) p(t) p(x | d) dr dt.
# Can you calculate C using R's cubature package? (The normal integrate function
# does not work; you need a special multidimensional integration package.)
# The cubature package is not a default package, so you need to use install.packages
# to get it.
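# A minimal sketch of the kde2d plot (MASS ships with R but must be loaded;
# rt.sam is the output of the rt.mcmcf sketch above):
library(MASS)
rt.kde <- kde2d(rt.sam$r, rt.sam$t, n=50) # 2D kernel density estimate on a grid
image(rt.kde, xlab="r (s/My)", ylab="t (Ma)")
contour(rt.kde, add=TRUE)
persp(rt.kde$x, rt.kde$y, rt.kde$z, xlab="r (s/My)", ylab="t (Ma)", zlab="density",
      theta=40, phi=40)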