# Introduction to Bayesian modeling

# Binomial probability distribution.
# Probability (likelihood) of observing S survivors in N individuals given survival probability theta
S=8
N=10
theta=0.76
# help(dbinom)
dbinom(x=S,size=N,prob=theta)

# Use R to calculate the likelihood for a series (a vector) of values for S at once
allS=0:N
like=dbinom(x=allS,size=N,prob=theta)
data.frame(allS,like)
x11(height=5.5,width=7,xpos=1200,ypos=1)
# windows(height=5.5,width=7,xpos=1200,ypos=1)
plot(allS,like,pch=16,main="binomial likelihood")
sum(like)

# Or calculate the log-likelihood, which is usually the currency in likelihood models
loglike=dbinom(x=allS,size=N,prob=theta,log=TRUE)
# data.frame(allS,loglike)
x11(height=5.5,width=7,xpos=1200,ypos=500)
# windows(height=5.5,width=7,xpos=1200,ypos=500)
plot(allS,loglike,pch=16,main="binomial log-likelihood")

# Consider again the data, S=8 survivors of N=10 individuals, and what the likelihood of observing this
# would be given various theta.
theta=0.76
dbinom(x=S,size=N,prob=theta)
theta=0.74
dbinom(x=S,size=N,prob=theta)
theta=0.72
dbinom(x=S,size=N,prob=theta)
theta=0.78
dbinom(x=S,size=N,prob=theta)
theta=0.82
dbinom(x=S,size=N,prob=theta)

# Take advantage of the vectorization again. This is the concept of the posterior distribution.
theta=seq(0,1,by=0.01)
postlike=dbinom(x=S,size=N,prob=theta)
dev.set(2)
plot(theta,postlike,pch=16,type="l",main="binomial posterior",ylab="prob")
abline(v=c(0.76,0.8,0.84))
dev.set(3)
logpostlike=dbinom(x=S,size=N,prob=theta,log=TRUE)
plot(theta,logpostlike,pch=16,type="l",main="binomial posterior (log-likelihood)",ylab="prob")
abline(v=c(0.76,0.8,0.84))

# Approximating the integral of the posterior 'distribution' is easy using sum, remembering to multiply by
# the width of the intervals in theta (0.01). The graph illustrates why this quick approximation to the
# integral works.
x11(height=8,width=11,xpos=700,ypos=1)
# windows(height=8,width=11,xpos=700,ypos=1)
plot(theta,postlike,pch=16,main="binomial posterior",ylab="prob")
abline(v=c(.595,.605))
abline(h=0.121,col="red")
sum(0.01*postlike)

# Dividing by the sum converts this to a true probability distribution (by assuring the integral is 1).
k=sum(0.01*postlike)
post=postlike/k
sum(0.01*post)

# Overlay the graphs of the adjusted posterior
dev.set(2)
lines(theta,post,type="l",col="red",main="adjusted binomial posterior",ylab="prob")
plot(theta,post,pch=16,type="l",col="red",main="adjusted binomial posterior",ylab="prob")
abline(v=c(0.76,0.8,0.84))
lines(theta,postlike)
dev.set(3)
logpost=logpostlike-log(k)
plot(theta,logpost,type="l",col="red",main="adjusted binomial posterior (log)",ylab="prob")
abline(v=c(0.76,0.8,0.84))
lines(theta,logpostlike)

# This probability distribution -- the posterior distribution -- is the beta distribution, with parameters
# S+1 and N-S+1.
# help(dbeta)
betaprob=dbeta(theta,shape1=S+1,shape2=N-S+1)
dev.set(2)
points(theta,betaprob)
logbetaprob=dbeta(theta,shape1=S+1,shape2=N-S+1,log=TRUE)
dev.set(3)
points(theta,logbetaprob)

# Since the beta distribution describes the posterior distribution, and it's a true probability distribution,
# it gives all the Bayesian results.
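# A quick numeric check (an added sketch, not part of the original workshop code): the grid-normalized
# posterior 'post' should closely match the closed-form Beta(S+1, N-S+1) density, and the posterior mean
# can be approximated on the grid as sum(width * theta * post).
max(abs(post-betaprob))   # should be small (grid approximation vs closed-form beta density)
sum(0.01*theta*post)      # grid approximation of the posterior mean; compare to the formula below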
# help(dbeta)
# Mean (this is the formula): (S+1)/(N+2)
# 95% credible intervals are where the cumulative probability reaches .025 and .975:
qbeta(p=c(0.025,0.975),shape1=S+1,shape2=N-S+1)
dev.set(2)
CI=qbeta(p=c(0.025,0.975),shape1=S+1,shape2=N-S+1)
plot(theta,betaprob,type="l",main="beta distribution (posterior)")
abline(v=CI,col="blue",lwd=2)

# Random draws from the posterior distribution:
rbeta(n=25,shape1=S+1,shape2=N-S+1)
dev.set(3)
randdraw=rbeta(n=10000,shape1=S+1,shape2=N-S+1)
hist(x=randdraw,breaks=theta)
quantile(x=randdraw,probs=c(0.025,0.975))
CIrand=quantile(x=randdraw,probs=c(0.025,0.975))
abline(v=CIrand,col="blue",lwd=2)

# Doing a random walk on the likelihood function is an alternative way to make random draws from the
# posterior. It does not require a closed form for the posterior; rather, it relies on the fact that the
# likelihood function is a constant times the posterior.
x11(height=8,width=11,xpos=700,ypos=1)
# windows(height=8,width=11,xpos=700,ypos=1)
# Here's the likelihood function, as a function of theta. It's a constant times the posterior (computed above).
plot(theta,postlike,type="l",main="binomial pseudo-posterior",ylab="prob")
# Create a Markov chain of values of theta. See the functions in RfuncBayesWorkshop.r.
showBinomialMCMC(thetastart=.2,S=8,N=10)
showBinomialMCMC(thetastart=.2,S=8,N=10,pause=FALSE,steps=100)
thetaMCMC=showBinomialMCMC(thetastart=.2,S=8,N=10,pause=FALSE,steps=1000,showstep=4000)
# thetaMCMC=showBinomialMCMC(thetastart=.2,S=8,N=10,pause=FALSE,steps=4000,showstep=4000)

# Compare to the histogram based on the beta distribution
x11(height=8,width=8,xpos=200,ypos=500)
hist(x=randdraw,breaks=theta)
# Compare quantiles of the beta draws and the MCMC chain
quantile(x=randdraw,probs=c(0.025,0.975))
quantile(thetaMCMC,probs=c(.025,.975))
graphics.off()

# Posterior distribution of a normal likelihood
# Assume a single observation y is drawn from a normal distribution whose SD is known to be 25 and whose
# mean is known to be 125. The likelihood function is the normal density.
y=110
mu=125
sigma=25
dnorm(x=y,mean=mu,sd=sigma)
dnorm(x=y,mean=mu,sd=sigma,log=TRUE)
allY=1:250
like=dnorm(x=allY,mean=mu,sd=sigma)
x11(height=5.5,width=7,xpos=1200,ypos=1)
# windows(height=5.5,width=7,xpos=1200,ypos=1)
plot(allY,like,pch=16,type="l",main="normal likelihood")
abline(v=mu)
sum(like)
# This is (approximately) the probability of observing any value y given mu, sigma, and the normal assumption.

# To continue with the ideas demonstrated for the binomial, it is far more interesting to consider a series
# of values y. Imagine, for instance, that 10 different animals were weighed and we are interested in
# estimating the mean and SD of the population by estimating posterior distributions.
sample=10
weights=rnorm(n=sample,mean=mu,sd=sigma)
# If the mean were 125 and the SD 25, the likelihood is
dnorm(x=weights,mean=mu,sd=sigma)
# There are 10 points, so 10 likelihoods, and it is easier to work with the log-likelihood from the start.
# We always need the sum of the log-likelihoods, which is the (log) likelihood of the entire set of data.
# The absolute probability is not useful.
loglike=dnorm(x=weights,mean=mu,sd=sigma,log=TRUE)
totalLogLike=sum(loglike)

# Just like for the binomial, the posterior distribution starts by expressing likelihoods as a function of
# the parameters. But with the normal, there are two parameters. To make graphs, fix one of the two
# parameters. Start by assuming that the SD is known (fixed) and examine a posterior distribution of the
# parameter mu.
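# The helpers calcNormalManyMu() and calcNormalManySD() used below are defined in RfuncBayesWorkshop.r
# (not shown here). As a rough sketch of the behavior assumed in this script -- the summed log-likelihood
# of all observations at each candidate parameter value, holding the other parameter fixed -- they could
# look something like the following (the workshop versions may differ in details):
calcNormalManyMu.sketch=function(data,mean,sd)
  sapply(mean,function(m) sum(dnorm(x=data,mean=m,sd=sd,log=TRUE)))
calcNormalManySD.sketch=function(data,mean,sd)
  sapply(sd,function(s) sum(dnorm(x=data,mean=mean,sd=s,log=TRUE)))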
allMu=1:250
postloglike=calcNormalManyMu(weights,mean=allMu,sd=sigma)
x11(height=5.5,width=7,xpos=1200,ypos=500)
# windows(height=5.5,width=7,xpos=1200,ypos=500)
plot(allMu,postloglike,type="l",pch=16,main="normal posterior likelihood (mean)")
plot(allMu,exp(postloglike),type="l",pch=16,main="normal posterior likelihood (mean)")
sum(exp(postloglike))
k=sum(exp(postloglike))
# The closed-form posterior for mu (SD known, flat prior) is normal with mean = sample mean and
# SD = sigma/sqrt(n); scale it by k to overlay on the unnormalized curve.
post=dnorm(allMu,mean=mean(weights),sd=sigma/sqrt(sample))
points(allMu,k*post)

# Repeat, but now vary the SD while fixing mu. I.e., the mean is known; examine the posterior distribution
# of the SD.
allSD=1:100
postloglike=calcNormalManySD(weights,mean=mu,sd=allSD)
x11(height=5.5,width=7,xpos=1200,ypos=500)
# windows(height=5.5,width=7,xpos=1200,ypos=500)
plot(allSD,postloglike,type="l",pch=16,main="normal posterior likelihood (SD)")
plot(allSD,exp(postloglike),type="l",pch=16,main="normal posterior likelihood (SD)")
sum(exp(postloglike))
k=sum(exp(postloglike))
graphics.off()

# Run a Gibbs sampler with Metropolis acceptance to calculate posterior distributions of mu and SD.
# The function normalPosterior.Gibbs is in RfuncBayesWorkshop.r.
GibbsResult=normalPosterior.Gibbs(data=weights,steps=20,showstep=20)
GibbsResult=normalPosterior.Gibbs(data=weights,steps=20,showstep=1,pause=FALSE)
GibbsResult

# Plot the results of a long run
GibbsResult=normalPosterior.Gibbs(data=weights,steps=4000,showstep=4000,pause=FALSE)
colMeans(GibbsResult)
x11(height=5.5,width=7,xpos=1200,ypos=1)
# windows(height=5.5,width=7,xpos=1200,ypos=1)
plot(GibbsResult$muMCMC)
x11(height=5.5,width=7,xpos=1200,ypos=500)
plot(GibbsResult$sdMCMC)

# Adjusting the step size works much better
GibbsResult=normalPosterior.Gibbs(data=weights,steps=4000,showstep=4000,pause=FALSE,adjuststep=TRUE)
colMeans(GibbsResult)
dev.set(2)
plot(GibbsResult$muMCMC)
dev.set(3)
plot(GibbsResult$sdMCMC)

# Show how the step sizes change
dev.set(2)
plot(GibbsResult$stepsizeMu)
dev.set(3)
plot(GibbsResult$stepsizeSD)

# Likelihood profiles
dev.set(2)
plot(GibbsResult$muMCMC,GibbsResult$llikeMu)
dev.set(3)
plot(GibbsResult$sdMCMC,GibbsResult$llikeSD)

# A "3D" likelihood profile: mu vs SD, with the log-likelihood shown by color
x11(height=8,width=10,xpos=500,ypos=1)
# windows(height=8,width=10,xpos=500,ypos=1)
maxllike=max(GibbsResult$llikeSD)
plot(GibbsResult$sdMCMC,GibbsResult$muMCMC,col="red",pch=1)
include=GibbsResult$llikeSD>(maxllike-4)
points(GibbsResult$sdMCMC[include],GibbsResult$muMCMC[include],col="lightgreen",pch=16)
include=GibbsResult$llikeSD>(maxllike-2)
points(GibbsResult$sdMCMC[include],GibbsResult$muMCMC[include],col="lightblue",pch=16)
include=GibbsResult$llikeSD>(maxllike-1)
points(GibbsResult$sdMCMC[include],GibbsResult$muMCMC[include],col="green",pch=16)
include=GibbsResult$llikeSD>(maxllike-.5)
points(GibbsResult$sdMCMC[include],GibbsResult$muMCMC[include],col="blue",pch=16)
include=GibbsResult$llikeSD>(maxllike-.25)
points(GibbsResult$sdMCMC[include],GibbsResult$muMCMC[include],col="black",pch=16)
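# For reference, here is a minimal sketch of a Metropolis-within-Gibbs sampler of the kind
# normalPosterior.Gibbs implements (an added illustration only; the workshop function in
# RfuncBayesWorkshop.r also plots, tracks log-likelihoods and step sizes, and can adapt the step size).
# Assumptions in this sketch: flat priors, symmetric uniform proposals, fixed step sizes.
normalGibbs.sketch=function(data,steps=4000,stepMu=5,stepSD=5) {
  loglik=function(m,s) sum(dnorm(x=data,mean=m,sd=s,log=TRUE))
  muCur=mean(data)
  sdCur=sd(data)
  muMCMC=sdMCMC=numeric(steps)
  for (i in 1:steps) {
    # Metropolis update of mu, holding the SD fixed
    muNew=muCur+runif(1,-stepMu,stepMu)
    if (log(runif(1)) < loglik(muNew,sdCur)-loglik(muCur,sdCur)) muCur=muNew
    # Metropolis update of the SD, holding mu fixed (proposals <= 0 are rejected)
    sdNew=sdCur+runif(1,-stepSD,stepSD)
    if (sdNew>0 && log(runif(1)) < loglik(muCur,sdNew)-loglik(muCur,sdCur)) sdCur=sdNew
    muMCMC[i]=muCur
    sdMCMC[i]=sdCur
  }
  data.frame(muMCMC,sdMCMC)
}
# Example use of the sketch (posterior means should be near those from normalPosterior.Gibbs):
# sketchResult=normalGibbs.sketch(data=weights)
# colMeans(sketchResult)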