##### simulation.R #####
# Example code for simulations: boostrap, power, sensitivity.
#

##### Simulations: How #####

# 1. create (or find) a distribution to sample from
d1 = rnorm(n=1000,sd=2,mean=10)
d2 = c(rep(1,100),rep(2,200),rep(3,300),rep(4,400))
d3 = rbinom(n=1000,p=0.2,size=1)
d4 = rpois(n=1000,lambda=1+d3)
  hist(d1)
  hist(d2)
  hist(d3)
  hist(d4)
  
# 2. Sample from the distribution using a function
s1 = sample(d1, 1000, replace=TRUE)
s2 = sample(d2, 1000, replace=TRUE)
s3 = sample(d3, 1000, replace=TRUE)
s4 = sample(d4, 1000, replace=TRUE)
  
# 3. Do something with the samples
mean(s1)
sd(s1)
glm(s4 ~ s3)

##### Example: Bootstrap #####
# Bootstrap a confidence interval for the sample mean (n=100)
  
  # Step 1. write a function to take a sample
  my.sample = function(x) {
    sample(x,100)
  }
  # Step 2. write a function to calculate a statistic form a sample
  my.stat = function(x) {
    mean(x)
  }
  # Step 3. combine 1 and 2, and repeat many times
  sims = replicate(1000,my.sample(d1))
  results = apply(sims,2,my.stat)
  
  # Step 4. Summarize the distribution of results
  mean(results)
  sd(results)
  2/(100^0.5) # exact calculation
  quantile(results,probs = c(0.025,0.975))
        
# Boostrap a cauchy distribution for a ratio confidence interval
# (Cauchy: can be expressed as a ratio of two normal variables)
# Application: calculating a risk ratio form two modeled risks
# Risk A: 2.0 (95% CI 1.8 to 2.2) 
# Risk B: 1.2 (95% CI 1.1 to 1.3)
# What is the CI for RR A/B?
sd.a = (2.2-1.8)/(2*qnorm(0.975))
sd.b = (1.3-1.1)/(2*qnorm(0.975))
# simulate distribution of risks (obtained from model)
risk.a = rnorm(mean=2, sd=sd.a, n=100000)
risk.b = rnorm(mean=1.2, sd=sd.b, n=10000)
# inspect the modeled risk distribution
hist(risk.a,xlim=c(0,4),ylim=c(0,10),freq=FALSE,col="red")
hist(risk.b,add=TRUE,freq=FALSE,col="blue")

# Step 1&2. write a function to sample and calculate statistic
my.sample = function(a,b) {
  sample(a,1) / sample(b,1)
}

# Step 3. combine 1 and 2, and repeat many times
results = replicate(1000,my.sample(risk.a,risk.b))

# Step 4. Summarize the distribution of results
hist(results)
mean(results)
quantile(results,probs = c(0.025,0.975))  

##### Example: Sensitivity #####
# How sensitive are my results to measurement error?

# Simulate an example dataset (you wouldn't do this part)
exposure = rbinom(p=0.4,n=1000,size=1)  # 40% exposure prevalence
error = rnorm(n=1000)
covariate = rbinom(p=0.2,n=1000,size=1)
outcome = rbinom(p=0.2*exposure+0.1*covariate+0.1*error>0, size=1, n=1000)
# you would just use your data instead
        
# crude model
m0 = glm(outcome ~ exposure + covariate)
summary(m0)   

# measurement error model (20% Misclassification at random)
exp.wrong = exposure                      # make a copy to mess up
wrongs = sample(1:1000,size=200)          # sample indices to flip
exp.wrong[wrongs] = 1-exp.wrong[wrongs]   # flip errors
  
m1 = glm(outcome ~ exp.wrong + covariate)
summary(m1)   # Bias towards the null! But not enough information...

# Characterize based on a simulation: what is the distribution under MAR?
my.sample = function() {
  ex = exposure                      # make a copy to mess up
  flip = sample(1:1000,size=200)          # sample indices to flip
  ex[flip] = 1-ex[flip]   # flip errors
  me = glm(outcome ~ ex + covariate)
  coef(me)[2] - coef(m0)[2]
}
# run the simulation 1000 times (takes a while)
results = replicate(1000,my.sample())
mean(results)   # mean for ME
quantile(results,probs = c(0.025,0.975))  # confidence bounds on ME
                
##### Example: Power #####
# How to calculate power for a study with confounding?
# Select power assumptions (add your own):
exposure.prevalence = 0.5
baseout.prevalence=0.1
outcome.prevalence = 0.2
prior.RD = 0.1
confounding = 0.1
confounder.prevalence = 0.3

# build model for data generation process (simulate this part)
lower.bound = function() {
confounder = rbinom(p=confounder.prevalence,n=1000,size=1)
exposure = rbinom(p=exposure.prevalence+confounding*confounder,n=1000,size=1)
outcome = rbinom(p=baseout.prevalence+prior.RD*exposure+confounding*confounder,n=1000,size=1)
  confint.default(glm(outcome~exposure+confounder,family=binomial("logit")))[2,1]
}

results = replicate(3000,lower.bound())
mean(results>0)  # about 98% power to detect

##### Challenge: Estimate 95% CI for LB and UB (instead of power)