Load the necessary packages. (We will also call SimDesign::quiet() later, so the SimDesign package should be installed as well.)
library(tidyverse)
library(gee)
library(lme4)
In a simulation study, we decide what the truth is. This allows us to do a lot: because the true parameter values are known, we can check directly whether our estimators are unbiased, whether their standard errors are well calibrated, and whether confidence intervals achieve their nominal coverage.
Consider the following model:
\[Y_{i} = \beta_0 + \beta_1 X_i + \epsilon_i\] \[\epsilon_i \stackrel{iid}{\sim} N(0, \sigma^2)\] \[i = 1, \ldots, n\]
First, set up some data generating parameters for our simulation.
set.seed(2187)
n <- 50 # Sample size
x1 <- runif(n, 0, 10) # A covariate following Uniform (0,10)
X <- cbind(1, x1) # Design matrix (with an intercept)
sigma <- 5 # Residual standard deviation
beta <- matrix(c(1, 2), ncol = 1) # True regression coefficients
mu <- X %*% beta # Mean
Y <- rnorm(n, mu, sigma) # Outcome
Fit the model and visualize.
fit <- lm(Y ~ x1)
data.frame(Y, x1) %>%
ggplot(aes(x = x1, y = Y)) +
geom_point() +
geom_abline(intercept = coef(fit)[1], slope = coef(fit)[2], color = 'red')
#geom_smooth(method = 'lm') # ggplot can automatically draw lines for you
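As a quick sanity check (an added step, not part of the original walkthrough), we can line the fitted coefficients up against the true values used to generate the data; with n = 50 they should be close but not exact.
# Compare fitted coefficients with the true values
data.frame(Truth = c(beta), Estimate = coef(fit))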
Run a simulation.
# Specify number of simulations
n.iter <- 5000
# Allocate storage for results
results <- data.frame(Est.int = NA, Est.x = NA, SE.int = NA, SE.x = NA) %>%
uncount(n.iter)
# Run simulation
for (i in 1:n.iter){
if(i %% 1000 == 0){print(paste("Iteration", i))} # Print updates
Y <- rnorm(n, mu, sigma) # Simulate outcome
fit <- lm(Y ~ x1) # Fit model
results[i,1:2] <- coef(fit) # Store coefficients
results[i,3:4] <- sqrt(diag(vcov(fit))) # Store standard errors
}
## [1] "Iteration 1000"
## [1] "Iteration 2000"
## [1] "Iteration 3000"
## [1] "Iteration 4000"
## [1] "Iteration 5000"
head(results)
## Est.int Est.x SE.int SE.x
## 1 -0.8246897 2.354340 1.254968 0.2212428
## 2 2.6877673 1.796402 1.367630 0.2411044
## 3 1.3660347 2.130822 1.391086 0.2452394
## 4 0.3641681 2.037344 1.421851 0.2506631
## 5 0.3334922 2.136970 1.372763 0.2420092
## 6 2.8221716 1.845620 1.460441 0.2574663
Calculate some inferential statistics. We expect (1) the biases to be close to zero, and (2) the empirical standard deviation of the estimates to be similar to the average asymptotic standard error.
# Bias (average difference between estimated coefficients and true values)
bias <- colMeans(results[,1:2]) - beta
# Empirical standard error (std dev of estimates across simulations)
emp.se <- apply(results[,1:2], 2, sd)
# Mean of asymptotic standard error
asymp.se <- colMeans(results[,3:4])
round(data.frame(bias, emp.se, asymp.se), 3)
## bias emp.se asymp.se
## Est.int 0.005 1.485 1.462
## Est.x 0.000 0.260 0.258
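How close to zero should the bias be? A rough yardstick (a standard add-on, not part of the original summary) is the Monte Carlo standard error of each bias estimate: the empirical standard error divided by the square root of the number of simulations.
# Monte Carlo standard error of the bias estimates
mcse <- emp.se / sqrt(n.iter)
round(data.frame(bias, mcse), 3)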
We can also check coverage probability: the proportion of nominal 95% confidence intervals that contain the true parameter value. Let's do it for the slope on x1; this should be close to 95%.
Upper95 <- results$Est.x + 1.96*results$SE.x
Lower95 <- results$Est.x - 1.96*results$SE.x
mean(Lower95 < 2 & Upper95 > 2) # Pretty close to 95%!
## [1] 0.9436
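Two optional refinements (added here, not part of the original analysis): lm-based confidence intervals actually use the t critical value with n - 2 degrees of freedom rather than 1.96, and the coverage estimate itself has Monte Carlo error, which the usual binomial standard error approximates.
# Coverage using the t critical value that lm-based intervals actually use
crit <- qt(0.975, df = n - 2) # slightly larger than 1.96 when n = 50
mean(results$Est.x - crit*results$SE.x < 2 & results$Est.x + crit*results$SE.x > 2)
# Approximate Monte Carlo standard error of the coverage estimate
sqrt(0.95 * 0.05 / n.iter)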
What happens if the residual error does not follow a normal distribution? Let’s consider the possibility of a t-distribution or log-normal distribution.
# Simulate normal random error
norm.err <- rnorm(n, 0, sigma)
# Simulate t(df = 5) random error
df <- 5
t.err <- rt(n, df = df) * sqrt((df-2)/df) * sigma # rescaled so the SD equals sigma
# Simulate log-normal random error
sd.log <- 1 # SD on the log scale for the log-normal
sd.lnorm <- sqrt((exp(sd.log^2)-1) * exp(sd.log^2)) # implied SD on the original scale
lnorm.err <- (rlnorm(n, meanlog = 0, sdlog = sd.log) - exp(sd.log^2/2)) / sd.lnorm * sigma # centered and rescaled so mean = 0, SD = sigma
data.frame(Error = c(norm.err, t.err, lnorm.err),
Type = rep(c('Normal','t(5)','log-Normal'), each = n)) %>%
ggplot(aes(x = Error, fill = Type)) +
geom_density() +
facet_wrap(~Type, scales = 'free_x')
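Each error distribution above is rescaled so that, in the population, it has mean 0 and standard deviation \(\sigma\). A quick empirical check (the sample size 1e6 and the e.* object names are arbitrary and used only here) confirms the scaling:
# Verify that each rescaled error distribution has mean ~0 and SD ~sigma
N <- 1e6 # large check sample
e.norm <- rnorm(N, 0, sigma)
e.t <- rt(N, df = df) * sqrt((df-2)/df) * sigma
e.lnorm <- (rlnorm(N, meanlog = 0, sdlog = sd.log) - exp(sd.log^2/2)) / sd.lnorm * sigma
data.frame(Type = c('Normal', 't(5)', 'log-Normal'),
Mean = round(c(mean(e.norm), mean(e.t), mean(e.lnorm)), 2),
SD = round(c(sd(e.norm), sd(e.t), sd(e.lnorm)), 2))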
Now let's conduct a simulation study to see how these violations of the normality assumption affect our results.
results <- data.frame(Est.int = NA, Est.x = NA, SE.int = NA, SE.x = NA) %>%
uncount(n.iter)
# Run simulation with t(5) error
for (i in 1:n.iter){
if(i %% 1000 == 0) print(paste("Iteration", i)) # Print updates
Y <- mu + rt(n, df) * sqrt((df-2)/df) * sigma # Simulate outcome
fit <- lm(Y ~ x1) # Fit model
results[i,1:2] <- coef(fit) # Store coefficients
results[i,3:4] <- sqrt(diag(vcov(fit))) # Store standard errors
}
## [1] "Iteration 1000"
## [1] "Iteration 2000"
## [1] "Iteration 3000"
## [1] "Iteration 4000"
## [1] "Iteration 5000"
bias <- colMeans(results[,1:2]) - beta
emp.se <- apply(results[,1:2], 2, sd)
asymp.se <- colMeans(results[,3:4])
round(data.frame(bias, emp.se, asymp.se), 3)
## bias emp.se asymp.se
## Est.int 0.000 1.485 1.453
## Est.x -0.003 0.260 0.256
Overall, the estimates remain essentially unbiased, and the asymptotic standard errors slightly underestimate the empirical variability.
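Because results now holds the t(5) simulation, we can repeat the earlier coverage calculation for the slope to see how this mild underestimation carries over to interval estimates.
# Coverage of the nominal 95% interval for the slope under t(5) errors
Upper95 <- results$Est.x + 1.96*results$SE.x
Lower95 <- results$Est.x - 1.96*results$SE.x
mean(Lower95 < 2 & Upper95 > 2)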
What happens if the residual error has a skewed distribution? We will use a log-normal distribution, shifted and scaled to have mean zero and variance \(\sigma^2\).
results <- data.frame(Est.int = NA, Est.x = NA, SE.int = NA, SE.x = NA) %>%
uncount(n.iter)
sd.log <- 1 # SD on the log scale for the log-normal
sd.lnorm <- sqrt((exp(sd.log^2)-1) * exp(sd.log^2)) # implied SD on the original scale
for (i in 1:n.iter){
if(i %% 1000 == 0) print(paste("Iteration", i))
r <- (rlnorm(n, meanlog = 0, sdlog = sd.log) - exp(sd.log^2/2)) / sd.lnorm * sigma # centered, scaled log-normal error
Y <- mu + r # Simulate outcome
fit <- lm(Y ~ x1) # Fit model
results[i,1:2] <- coef(fit) # Store coefficients
results[i,3:4] <- sqrt(diag(vcov(fit))) # Store standard errors
}
## [1] "Iteration 1000"
## [1] "Iteration 2000"
## [1] "Iteration 3000"
## [1] "Iteration 4000"
## [1] "Iteration 5000"
bias <- colMeans(results[,1:2]) - beta
emp.se <- apply(results[,1:2], 2, sd)
asymp.se <- colMeans(results[,3:4])
round(data.frame(bias, emp.se, asymp.se), 3)
## bias emp.se asymp.se
## Est.int 0.042 1.541 1.366
## Est.x -0.005 0.267 0.241
Again, the bias is small, but the asymptotic standard errors underestimate the empirical standard errors, and more noticeably than with the t(5) errors.
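A parallel coverage check (added for completeness) makes this concrete for both coefficients; recall that the true intercept is 1 and the true slope is 2.
# Coverage of nominal 95% intervals under log-normal errors
mean(results$Est.int - 1.96*results$SE.int < 1 & results$Est.int + 1.96*results$SE.int > 1)
mean(results$Est.x - 1.96*results$SE.x < 2 & results$Est.x + 1.96*results$SE.x > 2)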
Now consider clustered data. We assume data are generated from 50 clusters of size 5 with an exchangeable correlation \(\rho\), and we will evaluate how different values of \(\rho\) affect inference.
In addition to the simple linear model, we have the random intercept model: \[Y_{ij} = \theta_i + \beta_0 + \beta_1 \times X_{ij} + \epsilon_{ij}\] \[\epsilon_{ij} \sim N(0,\sigma^2), \quad \theta_i \sim N(0, \tau^2)\]
The corresponding GEE model with exchangeable working correlation structure: \[E\left[Y_{ij}\right] = \beta_0 + \beta_1 \times X_{ij}\] \[\mathrm{Cov}\left[\mathbf{Y}_{i}\right] = (\tau^2 + \sigma^2)\,\mathbf{R}_0\] \[\mathbf{R}_0 = (1-\rho)\mathbf{I}_5 + \rho\mathbf{J}_5\] for \(i = 1, \ldots, 50\) and \(j = 1, \ldots, 5\), where \(\mathbf{Y}_{i} = (Y_{i1}, \ldots, Y_{i5})^\top\) is the outcome vector for cluster \(i\). Recall that \(\rho = \frac{\tau^2}{\tau^2 + \sigma^2}\), which implies \(\tau^2 = \frac{\rho\sigma^2}{1 - \rho}\).
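As a quick check of this relationship (not part of the main simulation; the rho.chk and tau.chk names and the value 0.4 are arbitrary), we can simulate many pairs of observations that share a random intercept and verify that their correlation is close to the target \(\rho\):
# Check that tau^2 = rho*sigma^2/(1-rho) induces the desired intra-class correlation
rho.chk <- 0.4 # arbitrary target correlation
tau.chk <- sqrt(rho.chk * sigma^2 / (1 - rho.chk))
theta.chk <- rnorm(1e5, 0, tau.chk) # shared random intercepts
y1 <- theta.chk + rnorm(1e5, 0, sigma) # first member of each pair
y2 <- theta.chk + rnorm(1e5, 0, sigma) # second member of each pair
cor(y1, y2) # should be close to rho.chk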
# Set up parameters
n.iter <- 10000 # Number of simulations
m <- 50 # Number of clusters
n <- 5 # Cluster size
ID <- rep(1:m, each = n) # Cluster IDs
x1 <- runif(n*m, 0, 10) # Covariate changes within cluster
X <- cbind(1, x1)
mu <- X %*% beta
sigma <- 5
rho.list <- seq(0.1, 0.9, by = 0.1) # List of rho's to test
tau.list <- sqrt(rho.list*sigma^2/(1-rho.list)) # corresponding random intercept standard deviation
res.lm <- data.frame(rho = rho.list,
Est.int = NA, Est.x = NA,
Emp.int = NA, Emp.x = NA, Asymp.int = NA, Asymp.x = NA)
res.lmer <- res.lm
res.gee <- res.lm
# Run simulation for each rho
for (i in 1:length(rho.list)) {
rho.i <- rho.list[i]
tau.i <- tau.list[i]
print(paste("Working on rho =", rho.list[i]))
tmp.lm <- data.frame(Est.int = NA, Est.x = NA, SE.int = NA, SE.x = NA) %>%
uncount(n.iter)
tmp.lmer <- tmp.lm
tmp.gee <- tmp.lm
# Run Simulation
for (j in 1:n.iter){
# if (j %% 250 == 0) print(paste("Iteration", j))
# Simulate random intercepts
theta.i <- rep(rnorm(m, 0, tau.i), each = n)
# Simulate outcome
Y <- rnorm(m*n, theta.i + mu, sigma)
# Simple linear model
fit.lm <- lm(Y ~ x1)
# Random intercept model (SimDesign::quiet suppresses fitting messages)
fit.lmer <- SimDesign::quiet(lmer(Y ~ x1 + (1|ID)))
# GEE with exchangeable working correlation
fit.gee <- SimDesign::quiet(gee(Y ~ x1, id = ID, corstr = 'exchangeable'))
# Store results
tmp.lm[j,1:2] <- coef(fit.lm)
tmp.lm[j,3:4] <- sqrt(diag(vcov(fit.lm)))
tmp.lmer[j,1:2] <- fixef(fit.lmer)
tmp.lmer[j,3:4] <- sqrt(diag(vcov(fit.lmer)))
tmp.gee[j,1:2] <- coef(fit.gee)
tmp.gee[j,3:4] <- sqrt(diag(fit.gee$robust.variance))
}
# Calculate statistics
est.lm <- colMeans(tmp.lm[,1:2])
est.lmer <- colMeans(tmp.lmer[,1:2])
est.gee <- colMeans(tmp.gee[,1:2])
emp.lm <- apply(tmp.lm[,1:2], 2, sd)
emp.lmer <- apply(tmp.lmer[,1:2], 2, sd)
emp.gee <- apply(tmp.gee[,1:2], 2, sd)
asymp.lm <- colMeans(tmp.lm[,3:4])
asymp.lmer <- colMeans(tmp.lmer[,3:4])
asymp.gee <- colMeans(tmp.gee[,3:4])
# Store results
res.lm[i,] <- c(rho.i, est.lm, emp.lm, asymp.lm)
res.lmer[i,] <- c(rho.i, est.lmer, emp.lmer, asymp.lmer)
res.gee[i,] <- c(rho.i, est.gee, emp.gee, asymp.gee)
}
## [1] "Working on rho = 0.1"
## [1] "Working on rho = 0.2"
## [1] "Working on rho = 0.3"
## [1] "Working on rho = 0.4"
## [1] "Working on rho = 0.5"
## [1] "Working on rho = 0.6"
## [1] "Working on rho = 0.7"
## [1] "Working on rho = 0.8"
## [1] "Working on rho = 0.9"
Now, let’s visualize the results.
res.all <- bind_rows(res.lm, res.lmer, res.gee) %>%
mutate(Model = rep(c('LM','Random Intercept','GEE'), each = length(rho.list))) %>%
pivot_longer(cols = c(Est.int, Est.x, Emp.int, Emp.x, Asymp.int, Asymp.x)) %>%
separate(name, c('Type','Parameter'))
# Plot average parameter estimates against rho
res.all %>%
filter(Type == 'Est') %>%
mutate(Truth = ifelse(Parameter == 'int', beta[1], beta[2])) %>%
ggplot(aes(x = rho, y = value, color = Model)) +
geom_line(alpha = 0.7) +
geom_hline(aes(yintercept = Truth), color = 'red', lty = 2) +
scale_x_continuous(breaks = rho.list) +
scale_y_continuous(expand = c(.1, .1)) +
facet_wrap(~Parameter, scales = 'free_y') +
theme_bw() +
labs(x = 'Rho', y = 'Average Parameter Estimate')
# Plot empirical and asymptotic standard errors
res.all %>%
filter(Type != 'Est') %>%
mutate(Label = paste(Model, '-', Type)) %>%
ggplot(aes(x = rho, y = value, color = Label, lty = Label)) +
geom_line(alpha = 0.7) +
scale_x_continuous(breaks = rho.list) +
scale_color_manual(values = rep(scales::hue_pal()(3), each = 2)) +
scale_linetype_manual(values = rep(c(1,2), times = 3)) +
facet_wrap(~Parameter, scales = 'free_y') +
theme_bw() +
theme(legend.position = 'bottom') +
labs(x = 'Rho', y = 'Average Standard Error', color = 'Model/Type', lty = 'Model/Type')
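As a numeric complement to the plots (an added summary, computed from res.all), the ratio of the average asymptotic standard error to the empirical standard error shows how well each model's reported uncertainty is calibrated across \(\rho\); values near 1 indicate well-calibrated standard errors.
# Ratio of average asymptotic SE to empirical SE, by model, parameter, and rho
res.all %>%
filter(Type != 'Est') %>%
pivot_wider(names_from = Type, values_from = value) %>%
mutate(SE.ratio = Asymp / Emp) %>%
select(Model, Parameter, rho, SE.ratio) %>%
arrange(Parameter, Model, rho)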