Load necessary packages.

library(tidyverse)
library(gee)
library(lme4)

Why do a simulation study?

In a simulation study, we decide what the truth is. This allows us to do a lot!

Simple Linear Regression Model

Consider the following model:

\[Y_{i} = \beta_0 + \beta_1 X_i + \epsilon_i\] \[\epsilon_i's \stackrel{iid}{\sim} N(0, \sigma^2)\] \[i = 1, ..., n\]

First, set up some data generating parameters for our simulation.

set.seed(2187)                

n <- 50                           # Sample size
x1 <- runif(n, 0, 10)             # A covariate following Uniform (0,10)
X <- cbind(1, x1)                 # Design matrix (with an intercept)
sigma <- 5                        # Residual standard deviation
beta <- matrix(c(1, 2), ncol = 1) # True regression coefficient
mu <- X %*% beta                  # Mean
Y <- rnorm(n, mu, sigma)          # Outcome

Fit the model and visualize.

fit <- lm(Y ~ x1)

data.frame(Y, x1) %>%
  ggplot(aes(x = x1, y = Y)) +
  geom_point() +
  geom_abline(intercept = coef(fit)[1], slope = coef(fit)[2], color = 'red') 

  #geom_smooth(method = 'lm') # ggplot can automatically draw lines for you

Run a simulation.

# Specify number of simulations
n.iter <- 5000

# Allocate storage for results
results <- data.frame(Est.int = NA, Est.x = NA, SE.int = NA, SE.x = NA) %>%
  uncount(n.iter)

# Run simulation
for (i in 1:n.iter){
  if(i %% 1000 == 0){print(paste("Iteration", i))}  # Print updates
  Y <- rnorm (n, mu, sigma)                         # Simulate outcome
  fit <- lm(Y ~ x1)                                 # Fit model
  results[i,1:2] <- coef(fit)                       # Store coefficients  
  results[i,3:4] <- sqrt(diag(vcov(fit)))           # Store standard errors
}
## [1] "Iteration 1000"
## [1] "Iteration 2000"
## [1] "Iteration 3000"
## [1] "Iteration 4000"
## [1] "Iteration 5000"
head(results) 
##      Est.int    Est.x   SE.int      SE.x
## 1 -0.8246897 2.354340 1.254968 0.2212428
## 2  2.6877673 1.796402 1.367630 0.2411044
## 3  1.3660347 2.130822 1.391086 0.2452394
## 4  0.3641681 2.037344 1.421851 0.2506631
## 5  0.3334922 2.136970 1.372763 0.2420092
## 6  2.8221716 1.845620 1.460441 0.2574663

Calculate some inferential statistics. We expect 1) the biases to be close to zero, and 2) the empirical standard deviation of the estimates should be similar to the asymptotic standard error.

# Bias (average difference between estimated coefficients and true values)
bias <- colMeans(results[,1:2]) - beta

# Empirical standard error (std dev of estimates across simulations)
emp.se <- apply(results[,1:2], 2, sd)

# Mean of asymptotic standard error
asymp.se <- colMeans(results[,3:4])

round(data.frame(bias, emp.se, asymp.se), 3)
##          bias emp.se asymp.se
## Est.int 0.005  1.485    1.462
## Est.x   0.000  0.260    0.258

We can also check coverage probability. Let’s do it for the slope parameter for x. This should be close to 95%.

Upper95 <- results$Est.x + 1.96*results$SE.x
Lower95 <- results$Est.x - 1.96*results$SE.x
mean(Lower95 < 2 & Upper95 > 2)                 # Pretty close to 95%!
## [1] 0.9436

Questions

  • How would the bias and standard error compare if we
    • increase sample size n?
    • increase/decrease sigma?
    • change the range of x (e.g. x ~ Unif[0, 20])?

Distributional Assumption Violations

What happens if the residual error does not follow a normal distribution? Let’s consider the possibility of a t-distribution or log-normal distribution.

# Simulate normal random error
norm.err <- rnorm(n, 0, sigma)

# Simulate t(df = 5) random error
df <- 5
t.err <- rt(n, df = df) * sqrt((df-2)/df) * sigma

# Simulate log-normal random error
sd.log <- 1 # SD on the log-scale for log-normal
sd.lnorm <- sqrt((exp(sd.log^2)-1) * exp(sd.log^2)) # SD on the exp scale
lnorm.err <- (rlnorm(n, meanlog = 0, sdlog = sd.log) - exp(sd.log^2/2)) / sd.lnorm * sigma
  
data.frame(Error = c(norm.err, t.err, lnorm.err),
           Type = rep(c('Normal','t(5)','log-Normal'), each = n)) %>%
  ggplot(aes(x = Error, fill = Type)) +
  geom_density() +
  facet_wrap(~Type, scales = 'free_x')

Now lets conduct a simulation study to see how these violated assumptions affect our results.

results <- data.frame(Est.int = NA, Est.x = NA, SE.int = NA, SE.x = NA) %>%
  uncount(n.iter)

# Run simulation with t(5) error
for (i in 1:n.iter){
  if(i %% 1000 == 0) print(paste("Iteration", i))   # Print updates
  Y <- mu + rt(n, df) * sqrt((df-2)/df) * sigma     # Simulate outcome
  fit <- lm(Y ~ x1)                                 # Fit model
  results[i,1:2] <- coef(fit)                       # Store coefficients  
  results[i,3:4] <- sqrt(diag(vcov(fit)))           # Store standard errors
}
## [1] "Iteration 1000"
## [1] "Iteration 2000"
## [1] "Iteration 3000"
## [1] "Iteration 4000"
## [1] "Iteration 5000"
bias <- colMeans(results[,1:2]) - beta
emp.se <- apply(results[,1:2], 2, sd)
asymp.se <- colMeans(results[,3:4])
round(data.frame(bias, emp.se, asymp.se), 3)
##           bias emp.se asymp.se
## Est.int  0.000  1.485    1.453
## Est.x   -0.003  0.260    0.256

Overall, we see that the estimates are unbiased and the asymptotic standard error is slightly under-estimated.

What happens if the residual error has a skewed distribution? We will assume a mean-zero log-Normal distribution that is centered to have mean zero and residual variance = \(\sigma^2\).

results <- data.frame(Est.int = NA, Est.x = NA, SE.int = NA, SE.x = NA) %>%
  uncount(n.iter)

sd.log <- 1 #sd on the log-scale for log-normal
sd.lnorm <- sqrt((exp(sd.log^2)-1) *exp(sd.log^2))#Sd on the exp scale
  
for (i in 1:n.iter){
  if(i %% 1000 == 0) print(paste("Iteration", i)) 
  r <- (rlnorm(n, meanlog = 0, sdlog = sd.log) - exp(sd.log^2/2)) / sd.lnorm*sigma
  Y <- mu + r
  fit <- lm(Y ~ x1)                                 # Fit model
  results[i,1:2] <- coef(fit)                       # Store coefficients  
  results[i,3:4] <- sqrt(diag(vcov(fit)))           # Store standard errors
}
## [1] "Iteration 1000"
## [1] "Iteration 2000"
## [1] "Iteration 3000"
## [1] "Iteration 4000"
## [1] "Iteration 5000"
bias <- colMeans(results[,1:2]) - beta
emp.se <- apply(results[,1:2], 2, sd)
asymp.se <- colMeans(results[,3:4])
round(data.frame(bias, emp.se, asymp.se), 3)
##           bias emp.se asymp.se
## Est.int  0.042  1.541    1.366
## Est.x   -0.005  0.267    0.241

Again, bias is still small and the asymptotic standard errors are under-estimated.

Impacts of Clustering

We assume data are generated from 50 clusters of size 5, and an exchangeable correlation \(\rho\). We will evaluate the impacts on inference with different values of \(\rho\).

In addition to the simple linear model, we have the random intercept model: \[Y_{ij} = \theta_i + \beta_0 + \beta_1 \times X_{ij} + \epsilon_{ij}\] \[\epsilon_{ij} \sim N(0,\sigma^2), \quad \theta_i \sim N(0, \tau^2)\]

The corresponding GEE model with exchangeable correlation structure: \[E\left[Y_{ij}\right] = \beta_0 + \beta_1 \times X_{ij}\] \[Cov\left[Y_{ij}\right] = \sigma^2\mathbf{R}_0\] \[\mathbf{R}_0 = (1-\rho)\mathbf{I}_5 + \rho\mathbf{J}_5\] For \(i = 1, ..., 50\) and \(j = 1, ... 5\). Recall that \(\rho = \frac{\tau^2}{\tau^2 + \sigma^2}\). This implies \(\tau^2 = \frac{\rho\sigma^2}{1 - \rho}\).

# Set up parameters
n.iter <- 10000    # Number of simulations
m <- 50           # Number of clusters
n <- 5            # Cluster size
ID <- rep(1:m, each = 5)


x1 <- runif(n*m, 0, 10)   # Covariate changes within cluster
X <- cbind(1, x1) 
mu <- X %*% beta

sigma <- 5
rho.list <- seq(0.1, 0.9, by = 0.1)             # List of rho's to test
tau.list <- sqrt(rho.list*sigma^2/(1-rho.list)) # corresponding random intercept standard deviation

res.lm <- data.frame(rho = rho.list, 
                     Est.int = NA, Est.x = NA, 
                     Emp.int = NA, Emp.x = NA, Asymp.int = NA, Asymp.x = NA)
res.lmer <- res.lm
res.gee <- res.lm

# Run Simulation for each tau
for (i in 1:length(rho.list)) {

  rho.i <- rho.list[i]
  tau.i <- tau.list[i]

  print(paste("Working on rho =", rho.list[i]))
  
  tmp.lm <- data.frame(Est.int = NA, Est.x = NA, SE.int = NA, SE.x = NA) %>%
    uncount(n.iter)
  tmp.lmer <- tmp.lm
  tmp.gee <- tmp.lm
  
  # Run Simulation
  for (j in 1:n.iter){

    # if (j %% 250 == 0) print(paste("Iteration", j))

    # Simulate random intercepts
    theta.i <- rep(rnorm(m, 0, tau.i), each = n)
    
    # Simulate outcome
    Y <- rnorm(m*n, theta.i + mu, sigma)

    # Simpler linear model
    fit.lm <- lm(Y ~ x1)
    
    # Random intercept model
    fit.lmer <- SimDesign::quiet(lmer(Y ~ x1 + (1|ID)))
    
    # GEE
    fit.gee <- SimDesign::quiet(gee(Y ~ x1, id = ID, corstr = 'exchangeable'))
    
    # Store results
    tmp.lm[j,1:2] <- coef(fit.lm)
    tmp.lm[j,3:4] <- sqrt(diag(vcov(fit.lm)))
    tmp.lmer[j,1:2] <- fixef(fit.lmer)
    tmp.lmer[j,3:4] <- sqrt(diag(vcov(fit.lmer)))
    tmp.gee[j,1:2] <- coef(fit.gee)
    tmp.gee[j,3:4] <- sqrt(diag(fit.gee$robust.variance))
  }
  
  # Calculate statistics
  est.lm <- colMeans(tmp.lm[,1:2])
  est.lmer <- colMeans(tmp.lmer[,1:2])
  est.gee <- colMeans(tmp.gee[,1:2])
  
  emp.lm <- apply(tmp.lm[,1:2], 2, sd)
  emp.lmer <- apply(tmp.lmer[,1:2], 2, sd)
  emp.gee <- apply(tmp.gee[,1:2], 2, sd)
  
  asymp.lm <- colMeans(tmp.lm[,3:4])
  asymp.lmer <- colMeans(tmp.lmer[,3:4])
  asymp.gee <- colMeans(tmp.gee[,3:4])
  
  # Store results
  res.lm[i,] <- c(rho.i, est.lm, emp.lm, asymp.lm)
  res.lmer[i,] <- c(rho.i, est.lmer, emp.lmer, asymp.lmer)
  res.gee[i,] <- c(rho.i, est.gee, emp.gee, asymp.gee)
}
## [1] "Working on rho = 0.1"
## [1] "Working on rho = 0.2"
## [1] "Working on rho = 0.3"
## [1] "Working on rho = 0.4"
## [1] "Working on rho = 0.5"
## [1] "Working on rho = 0.6"
## [1] "Working on rho = 0.7"
## [1] "Working on rho = 0.8"
## [1] "Working on rho = 0.9"

Now, let’s visualize the results.

res.all <- bind_rows(res.lm, res.lmer, res.gee) %>%
  mutate(Model = rep(c('LM','Random Intercept','GEE'), each = length(rho.list))) %>%
  pivot_longer(cols = c(Est.int, Est.x, Emp.int, Emp.x, Asymp.int, Asymp.x)) %>%
  separate(name, c('Type','Parameter'))

# Plot intercept estimates against rho
res.all %>%
  filter(Type == 'Est') %>%
  mutate(Truth = ifelse(Parameter == 'int', beta[1], beta[2])) %>%
  ggplot(aes(x = rho, y = value, color = Model)) +
  geom_line(alpha = 0.7) +
  geom_hline(aes(yintercept = Truth), color = 'red', lty = 2) +
  scale_x_continuous(breaks = rho.list) +
  scale_y_continuous(expand = c(.1, .1)) +
  facet_wrap(~Parameter, scales = 'free_y') +
  theme_bw() + 
  labs(x = 'Rho', y = 'Average Parameter Estimate')

# Plot empirical and asymptotic standard errors
res.all %>%
  filter(Type != 'Est') %>%
  mutate(Label = paste(Model, '-', Type)) %>%
  ggplot(aes(x = rho, y = value, color = Label, lty = Label)) +
  geom_line(alpha = 0.7) +
  scale_x_continuous(breaks = rho.list) +
  scale_color_manual(values = rep(scales::hue_pal()(3), each = 2)) +
  scale_linetype_manual(values = rep(c(1,2), times = 3)) +
  facet_wrap(~Parameter, scales = 'free_y') +
  theme_bw() +
  theme(legend.position = 'bottom') +
  labs(x = 'Rho', y = 'Average Standard Error', color = 'Model/Type', lty = 'Model/Type')