Load necessary packages.

library(tidyverse)
library(gee)
library(lme4)

Why do a simulation study?

In a simulation study, we decide what the truth is. This allows us to do a lot!

Convenient (although computationally expensive and inexact) way to conduct power analyses
Assessment of asymptotic properties that we sometimes take for granted
Model comparison in terms of accuracy and efficiency

Simple Linear Regression Model

Consider the following model:

First, set up some data generating parameters for our simulation.

set.seed(2187)                

n <- 50                           # Sample size
x1 <- runif(n, 0, 10)             # A covariate following Uniform (0,10)
X <- cbind(1, x1)                 # Design matrix (with an intercept)
sigma <- 5                        # Residual standard deviation
beta <- matrix(c(1, 2), ncol = 1) # True regression coefficient
mu <- X %*% beta                  # Mean
Y <- rnorm(n, mu, sigma)          # Outcome

Fit the model and visualize.

fit <- lm(Y ~ x1)

data.frame(Y, x1) %>%
  ggplot(aes(x = x1, y = Y)) +
  geom_point() +
  geom_abline(intercept = coef(fit)[1], slope = coef(fit)[2], color = 'red')

  #geom_smooth(method = 'lm') # ggplot can automatically draw lines for you

Run a simulation.

# Specify number of simulations
n.iter <- 5000

# Allocate storage for results
results <- data.frame(Est.int = NA, Est.x = NA, SE.int = NA, SE.x = NA) %>%
  uncount(n.iter)

# Run simulation
for (i in 1:n.iter){
  if(i %% 1000 == 0){print(paste("Iteration", i))}  # Print updates
  Y <- rnorm (n, mu, sigma)                         # Simulate outcome
  fit <- lm(Y ~ x1)                                 # Fit model
  results[i,1:2] <- coef(fit)                       # Store coefficients  
  results[i,3:4] <- sqrt(diag(vcov(fit)))           # Store standard errors
}

## [1] "Iteration 1000"
## [1] "Iteration 2000"
## [1] "Iteration 3000"
## [1] "Iteration 4000"
## [1] "Iteration 5000"

head(results)

##      Est.int    Est.x   SE.int      SE.x
## 1 -0.8246897 2.354340 1.254968 0.2212428
## 2  2.6877673 1.796402 1.367630 0.2411044
## 3  1.3660347 2.130822 1.391086 0.2452394
## 4  0.3641681 2.037344 1.421851 0.2506631
## 5  0.3334922 2.136970 1.372763 0.2420092
## 6  2.8221716 1.845620 1.460441 0.2574663

Calculate some inferential statistics. We expect 1) the biases to be close to zero, and 2) the empirical standard deviation of the estimates should be similar to the asymptotic standard error.

# Bias (average difference between estimated coefficients and true values)
bias <- colMeans(results[,1:2]) - beta

# Empirical standard error (std dev of estimates across simulations)
emp.se <- apply(results[,1:2], 2, sd)

# Mean of asymptotic standard error
asymp.se <- colMeans(results[,3:4])

round(data.frame(bias, emp.se, asymp.se), 3)

##          bias emp.se asymp.se
## Est.int 0.005  1.485    1.462
## Est.x   0.000  0.260    0.258

We can also check coverage probability. Let’s do it for the slope parameter for x. This should be close to 95%.

Upper95 <- results$Est.x + 1.96*results$SE.x
Lower95 <- results$Est.x - 1.96*results$SE.x
mean(Lower95 < 2 & Upper95 > 2)                 # Pretty close to 95%!

## [1] 0.9436

Questions

How would the bias and standard error compare if we
- increase sample size n?
- increase/decrease sigma?
- change the range of x (e.g. x ~ Unif[0, 20])?

Distributional Assumption Violations

What happens if the residual error does not follow a normal distribution? Let’s consider the possibility of a t-distribution or log-normal distribution.

# Simulate normal random error
norm.err <- rnorm(n, 0, sigma)

# Simulate t(df = 5) random error
df <- 5
t.err <- rt(n, df = df) * sqrt((df-2)/df) * sigma

# Simulate log-normal random error
sd.log <- 1 # SD on the log-scale for log-normal
sd.lnorm <- sqrt((exp(sd.log^2)-1) * exp(sd.log^2)) # SD on the exp scale
lnorm.err <- (rlnorm(n, meanlog = 0, sdlog = sd.log) - exp(sd.log^2/2)) / sd.lnorm * sigma
  
data.frame(Error = c(norm.err, t.err, lnorm.err),
           Type = rep(c('Normal','t(5)','log-Normal'), each = n)) %>%
  ggplot(aes(x = Error, fill = Type)) +
  geom_density() +
  facet_wrap(~Type, scales = 'free_x')

Now lets conduct a simulation study to see how these violated assumptions affect our results.

results <- data.frame(Est.int = NA, Est.x = NA, SE.int = NA, SE.x = NA) %>%
  uncount(n.iter)

# Run simulation with t(5) error
for (i in 1:n.iter){
  if(i %% 1000 == 0) print(paste("Iteration", i))   # Print updates
  Y <- mu + rt(n, df) * sqrt((df-2)/df) * sigma     # Simulate outcome
  fit <- lm(Y ~ x1)                                 # Fit model
  results[i,1:2] <- coef(fit)                       # Store coefficients  
  results[i,3:4] <- sqrt(diag(vcov(fit)))           # Store standard errors
}

## [1] "Iteration 1000"
## [1] "Iteration 2000"
## [1] "Iteration 3000"
## [1] "Iteration 4000"
## [1] "Iteration 5000"

bias <- colMeans(results[,1:2]) - beta
emp.se <- apply(results[,1:2], 2, sd)
asymp.se <- colMeans(results[,3:4])
round(data.frame(bias, emp.se, asymp.se), 3)

##           bias emp.se asymp.se
## Est.int  0.000  1.485    1.453
## Est.x   -0.003  0.260    0.256

Overall, we see that the estimates are unbiased and the asymptotic standard error is slightly under-estimated.

What happens if the residual error has a skewed distribution? We will assume a mean-zero log-Normal distribution that is centered to have mean zero and residual variance = .

results <- data.frame(Est.int = NA, Est.x = NA, SE.int = NA, SE.x = NA) %>%
  uncount(n.iter)

sd.log <- 1 #sd on the log-scale for log-normal
sd.lnorm <- sqrt((exp(sd.log^2)-1) *exp(sd.log^2))#Sd on the exp scale
  
for (i in 1:n.iter){
  if(i %% 1000 == 0) print(paste("Iteration", i)) 
  r <- (rlnorm(n, meanlog = 0, sdlog = sd.log) - exp(sd.log^2/2)) / sd.lnorm*sigma
  Y <- mu + r
  fit <- lm(Y ~ x1)                                 # Fit model
  results[i,1:2] <- coef(fit)                       # Store coefficients  
  results[i,3:4] <- sqrt(diag(vcov(fit)))           # Store standard errors
}

## [1] "Iteration 1000"
## [1] "Iteration 2000"
## [1] "Iteration 3000"
## [1] "Iteration 4000"
## [1] "Iteration 5000"

bias <- colMeans(results[,1:2]) - beta
emp.se <- apply(results[,1:2], 2, sd)
asymp.se <- colMeans(results[,3:4])
round(data.frame(bias, emp.se, asymp.se), 3)

##           bias emp.se asymp.se
## Est.int  0.042  1.541    1.366
## Est.x   -0.005  0.267    0.241

Again, bias is still small and the asymptotic standard errors are under-estimated.

Impacts of Clustering

We assume data are generated from 50 clusters of size 5, and an exchangeable correlation . We will evaluate the impacts on inference with different values of .

In addition to the simple linear model, we have the random intercept model:

The corresponding GEE model with exchangeable correlation structure:

For

and

. Recall that

. This implies

# Set up parameters
n.iter <- 10000    # Number of simulations
m <- 50           # Number of clusters
n <- 5            # Cluster size
ID <- rep(1:m, each = 5)


x1 <- runif(n*m, 0, 10)   # Covariate changes within cluster
X <- cbind(1, x1) 
mu <- X %*% beta

sigma <- 5
rho.list <- seq(0.1, 0.9, by = 0.1)             # List of rho's to test
tau.list <- sqrt(rho.list*sigma^2/(1-rho.list)) # corresponding random intercept standard deviation

res.lm <- data.frame(rho = rho.list, 
                     Est.int = NA, Est.x = NA, 
                     Emp.int = NA, Emp.x = NA, Asymp.int = NA, Asymp.x = NA)
res.lmer <- res.lm
res.gee <- res.lm

# Run Simulation for each tau
for (i in 1:length(rho.list)) {

  rho.i <- rho.list[i]
  tau.i <- tau.list[i]

  print(paste("Working on rho =", rho.list[i]))
  
  tmp.lm <- data.frame(Est.int = NA, Est.x = NA, SE.int = NA, SE.x = NA) %>%
    uncount(n.iter)
  tmp.lmer <- tmp.lm
  tmp.gee <- tmp.lm
  
  # Run Simulation
  for (j in 1:n.iter){

    # if (j %% 250 == 0) print(paste("Iteration", j))

    # Simulate random intercepts
    theta.i <- rep(rnorm(m, 0, tau.i), each = n)
    
    # Simulate outcome
    Y <- rnorm(m*n, theta.i + mu, sigma)

    # Simpler linear model
    fit.lm <- lm(Y ~ x1)
    
    # Random intercept model
    fit.lmer <- SimDesign::quiet(lmer(Y ~ x1 + (1|ID)))
    
    # GEE
    fit.gee <- SimDesign::quiet(gee(Y ~ x1, id = ID, corstr = 'exchangeable'))
    
    # Store results
    tmp.lm[j,1:2] <- coef(fit.lm)
    tmp.lm[j,3:4] <- sqrt(diag(vcov(fit.lm)))
    tmp.lmer[j,1:2] <- fixef(fit.lmer)
    tmp.lmer[j,3:4] <- sqrt(diag(vcov(fit.lmer)))
    tmp.gee[j,1:2] <- coef(fit.gee)
    tmp.gee[j,3:4] <- sqrt(diag(fit.gee$robust.variance))
  }
  
  # Calculate statistics
  est.lm <- colMeans(tmp.lm[,1:2])
  est.lmer <- colMeans(tmp.lmer[,1:2])
  est.gee <- colMeans(tmp.gee[,1:2])
  
  emp.lm <- apply(tmp.lm[,1:2], 2, sd)
  emp.lmer <- apply(tmp.lmer[,1:2], 2, sd)
  emp.gee <- apply(tmp.gee[,1:2], 2, sd)
  
  asymp.lm <- colMeans(tmp.lm[,3:4])
  asymp.lmer <- colMeans(tmp.lmer[,3:4])
  asymp.gee <- colMeans(tmp.gee[,3:4])
  
  # Store results
  res.lm[i,] <- c(rho.i, est.lm, emp.lm, asymp.lm)
  res.lmer[i,] <- c(rho.i, est.lmer, emp.lmer, asymp.lmer)
  res.gee[i,] <- c(rho.i, est.gee, emp.gee, asymp.gee)
}

## [1] "Working on rho = 0.1"
## [1] "Working on rho = 0.2"
## [1] "Working on rho = 0.3"
## [1] "Working on rho = 0.4"
## [1] "Working on rho = 0.5"
## [1] "Working on rho = 0.6"
## [1] "Working on rho = 0.7"
## [1] "Working on rho = 0.8"
## [1] "Working on rho = 0.9"

Now, let’s visualize the results.

res.all <- bind_rows(res.lm, res.lmer, res.gee) %>%
  mutate(Model = rep(c('LM','Random Intercept','GEE'), each = length(rho.list))) %>%
  pivot_longer(cols = c(Est.int, Est.x, Emp.int, Emp.x, Asymp.int, Asymp.x)) %>%
  separate(name, c('Type','Parameter'))

# Plot intercept estimates against rho
res.all %>%
  filter(Type == 'Est') %>%
  mutate(Truth = ifelse(Parameter == 'int', beta[1], beta[2])) %>%
  ggplot(aes(x = rho, y = value, color = Model)) +
  geom_line(alpha = 0.7) +
  geom_hline(aes(yintercept = Truth), color = 'red', lty = 2) +
  scale_x_continuous(breaks = rho.list) +
  scale_y_continuous(expand = c(.1, .1)) +
  facet_wrap(~Parameter, scales = 'free_y') +
  theme_bw() + 
  labs(x = 'Rho', y = 'Average Parameter Estimate')

Observations
- Both standard linear and random intercept models give close to unbiased intercept and slope estimates.
- The intercept estimates are more noisy because of larger Monte Carlo errors when is large.

# Plot empirical and asymptotic standard errors
res.all %>%
  filter(Type != 'Est') %>%
  mutate(Label = paste(Model, '-', Type)) %>%
  ggplot(aes(x = rho, y = value, color = Label, lty = Label)) +
  geom_line(alpha = 0.7) +
  scale_x_continuous(breaks = rho.list) +
  scale_color_manual(values = rep(scales::hue_pal()(3), each = 2)) +
  scale_linetype_manual(values = rep(c(1,2), times = 3)) +
  facet_wrap(~Parameter, scales = 'free_y') +
  theme_bw() +
  theme(legend.position = 'bottom') +
  labs(x = 'Rho', y = 'Average Standard Error', color = 'Model/Type', lty = 'Model/Type')

Observations for estimating the intercept
- Linear regression’s asymptotic standard error will be an underestimate
- Random intercept and GEE robust standard error are accurate estimates
- Random intercept and GEE robust standard error are very similar since this is a ``normal” outcome
- Standard error for the intercept increases with larger
Observations for estimating the slope
- Linear regression has larger standard error (less efficient) compare to the random intercept model and GEE, especially with increasing
- Random Intercept model and GEE are very similar, and standard errors do not change much with

Conducting Simulation Studies in R

Why do a simulation study?

Simple Linear Regression Model

Questions

Distributional Assumption Violations

Impacts of Clustering