This file reproduces the multiple imputation method for left-censored, correlated data described in Lee et al. (2012).
Before we get started with implementing the method, we need some data to practice on.
Create a function to generate bivariate normal left-censored data, where cens.pct controls the censoring level for each biomarker (each entry indexes a decile, so c(4,4) censors both biomarkers at their 40th percentiles).
f.datGen.BVNleftcens <- function(nn, mu.1, mu.2, Sigma.y, cens.pct = c(4,4)){
## GENERATE LEFT-CENSORED BVN DATA --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
YY <- MASS::mvrnorm(nn, c(0,0), Sigma.y) + cbind(mu.1, mu.2) # correlated biomarkers
YY <- as.data.frame(YY)
colnames(YY) <- c('y1','y2')
## quick sanity check (run interactively): colMeans(YY); cor(YY)
#### LEFT CENSOR THE DATA ####
## set limits of detection (cens.pct indexes deciles, e.g. c(4,4) censors at the 40th percentiles)
lod.1 <- quantile(YY$y1, 1:10/10)[cens.pct[1]] # LOD for y1
lod.2 <- quantile(YY$y2, 1:10/10)[cens.pct[2]] # LOD for y2
## left censor: values below the LOD are recorded at the LOD
YY$y1.obs <- pmax(YY$y1, lod.1)
YY$y2.obs <- pmax(YY$y2, lod.2)
## censoring indicator, 1 if censored (below lod), 0 if observed (above lod)
YY$cens.1 <- 1*(YY$y1.obs == lod.1)
YY$cens.2 <- 1*(YY$y2.obs == lod.2)
## naive LOD/2 substitution for censored values
YY$y1.half.lod <- ifelse(YY$cens.1==1, lod.1/2, YY$y1)
YY$y2.half.lod <- ifelse(YY$cens.2==1, lod.2/2, YY$y2)
return(YY)
}
Define the inputs and generate data from a bivariate normal distribution with marginal means \(\mu_1 = 1, \mu_2 = 2\) induced by a binary covariate \(x_1\).
set.seed(10)
nn <- 300
## coefficient and covariate for biomarker
x1 <- sample(0:1, nn, T)
XX <- model.matrix(~ x1) # design matrix
Beta1 <- c(0.5,1) # marginal mean 0.5 + 1*E[x1] = 1
Beta2 <- c(1.5,1) # marginal mean 1.5 + 1*E[x1] = 2
## means of each biomarker
mu.1 <- XX %*% Beta1
mu.2 <- XX %*% Beta2
rho <- 0.5 # correlation
Sigma.y <- diag(1-rho, 2) + rho # exchangeable covariance: unit variances, off-diagonal rho
YY <- f.datGen.BVNleftcens(nn, mu.1, mu.2, Sigma.y, cens.pct = c(4,4))
round(head(YY, 10),3)
## y1 y2 y1.obs y2.obs cens.1 cens.2 y1.half.lod y2.half.lod
## 1 -0.421 1.743 0.804 1.743 1 0 0.402 1.743
## 2 1.045 0.522 1.045 1.735 0 1 1.045 0.867
## 3 3.760 2.241 3.760 2.241 0 0 3.760 2.241
## 4 1.105 1.397 1.105 1.735 0 1 1.105 0.867
## 5 0.893 1.606 0.893 1.735 0 1 0.893 0.867
## 6 -1.292 -0.728 0.804 1.735 1 1 0.402 0.867
## 7 2.199 2.856 2.199 2.856 0 0 2.199 2.856
## 8 2.516 3.476 2.516 3.476 0 0 2.516 3.476
## 9 -0.982 0.905 0.804 1.735 1 1 0.402 0.867
## 10 -0.688 -0.048 0.804 1.735 1 1 0.402 0.867
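As a quick sanity check, the empirical censoring proportions should sit near the targeted 40% for each biomarker.
## proportion censored (close to 0.4 by construction)
mean(YY$cens.1)
mean(YY$cens.2)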
Specify the necessary inputs for the Gibbs sampler: \[\Sigma \sim \mathrm{InvWishart}(\nu_0, S_0),\] \[\beta_j \sim MVN(\beta_{0j}, \Omega_j = \sigma^2_j B_0).\] That is, the covariance matrix gets an inverse-Wishart prior and each biomarker's coefficient vector gets a multivariate normal prior whose covariance scales with that biomarker's variance (a matrix normal prior on the coefficient matrix, column by column).
## Hyperparameters for covariance matrix prior:
nu0.Sigma <- 4 # df Inv-Wishart
S0.Sigma <- diag(2) + 0.2 # scale matrix (weak correlation prior)
## Hyperparameters for regression coefficients (3 each: intercept, x1, and the outcome z added below):
beta1.0 <- c(0,0,0) # prior mean for beta_y1
covB.1.0 <- diag(1,3) # prior scale B0 for beta_y1
beta2.0 <- c(0,0,0) # prior mean for beta_y2
covB.2.0 <- diag(1,3) # prior scale B0 for beta_y2
## recover the LODs (the smallest half-LOD value is LOD/2)
lod.1 <- min(YY$y1.half.lod) * 2
lod.2 <- min(YY$y2.half.lod) * 2
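These recovered LODs should match the 40th percentiles used by the generator.
## sanity check: recovered LODs vs. the generator's 40th percentiles
c(lod.1, quantile(YY$y1, 0.4))
c(lod.2, quantile(YY$y2, 0.4))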
The outcome of interest \(Z\) is generated from a logistic regression based on the true biomarker values: \[ \text{logit}\{\Pr(Z=1)\} = b_0 + b_1 y_1 + b_2 y_2. \]
## Set the true b coefficients for outcome of interest
bb <- c(-0.1, -0.2, 0.3)
design.mat.Y <- model.matrix(~ y1 + y2, data = YY)
logit.p <- design.mat.Y %*% bb # logit(Pr(Z=1)) is linear in the true biomarkers
pp <- 1/(1+exp(-logit.p))
nn <- nrow(YY)
zz <- rbinom(nn, 1, pp) # outcome of interest
Previous research also indicates that imputation should use all available information, including the outcome of interest.
## update design matrix so imputation accounts for the outcome of interest.
XXz <- cbind(XX,zz)
Begin by sampling the covariance matrix from its inverse-Wishart\((\nu_0, S_0)\) prior and extracting the variance components and correlation.
## draw the initial covariance matrix and extract its components
Sig.init <- MCMCpack::riwish(nu0.Sigma, S0.Sigma)
sig2.y1.init <- as.numeric(Sig.init[1,1])
sig2.y2.init <- as.numeric(Sig.init[2,2])
rho.init <- as.numeric(Sig.init[1,2] / sqrt(sig2.y1.init*sig2.y2.init))
Recall that, conditional on the variance, each coefficient vector is multivariate normal with covariance \(\Omega_j = \sigma^2_j B_0\). Compute these prior covariances, then draw initial coefficient vectors for each biomarker.
## prior covariance matrices for the coefficient vectors
Omega1.0 <- sig2.y1.init * covB.1.0
Omega2.0 <- sig2.y2.init * covB.2.0
## beta1 for y1
beta.y1.init <- MASS::mvrnorm(n=1, mu = beta1.0, Sigma = Omega1.0)
## beta2 for y2
beta.y2.init <- MASS::mvrnorm(n=1, mu = beta2.0, Sigma = Omega2.0)
We are almost ready to run the Gibbs sampler; set up matrices to store each iteration, since we will extract \(M\) completed datasets for the MI analysis.
## num iterations
n.mcmc <- 5000
## set up samples dataframe and biomarker matrix to keep track of each loop
yy1.complete <- YY$y1.obs
yy2.complete <- YY$y2.obs
dat.length <- length(yy2.complete)
## data matrices for the imputed y1 and y2 biomarkers
yy1.dat <- yy2.dat <- matrix(NA, nrow = dat.length, ncol = n.mcmc + 1)
yy1.dat[,1] <- yy1.complete # observed data is initial
yy2.dat[,1] <- yy2.complete # observed data is initial
## parameter matrix for updated params
samples <- data.frame(
iteration = 0:n.mcmc,
y1.beta1 = NA,
y1.beta2 = NA,
y1.beta3 = NA,
y1.sigma2 = NA,
y2.beta1 = NA,
y2.beta2 = NA,
y2.beta3 = NA,
y2.sigma2 = NA,
rho = NA
)
samples[1,2:10] <- c(beta.y1.init[1], beta.y1.init[2], beta.y1.init[3], sig2.y1.init,
beta.y2.init[1], beta.y2.init[2], beta.y2.init[3], sig2.y2.init, rho.init) # initials
## initialise params for r-th iteration
beta.y1.r <- beta.y1.init
beta.y2.r <- beta.y2.init
##
Sigma.r <- Sig.init
sig2.y1.r <- sig2.y1.init
sig2.y2.r <- sig2.y2.init
rho.r <- rho.init
The last piece of bookkeeping involves the censoring patterns: index the observations by pattern so the Gibbs sampler knows which biomarkers to impute.
## index of censored biomarkers to impute, based on censoring pattern
#' impute censored values via the conditional normal distribution
#' pattern 1 := p1. both y1,y2 observed. (do nothing)
#' pattern 2 := p2. y1 observed, y2 censored.
#' pattern 3 := p3. y1 censored, y2 observed.
#' pattern 4 := p4. both y1,y2 censored.
cens.idx.p1 <- which(YY$cens.1==0 & YY$cens.2==0)
cens.idx.p2 <- which(YY$cens.1==0 & YY$cens.2==1)
cens.idx.p3 <- which(YY$cens.1==1 & YY$cens.2==0)
cens.idx.p4 <- which(YY$cens.1==1 & YY$cens.2==1)
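The four patterns partition the sample, so the counts below should sum to nn.
## observations per censoring pattern
c(p1 = length(cens.idx.p1), p2 = length(cens.idx.p2),
  p3 = length(cens.idx.p3), p4 = length(cens.idx.p4))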
The sampling algorithm is as follows.
Initialize \[ \theta^{(0)} = (\beta_1^{(0)}, \beta_2^{(0)}, \sigma_{1|2}^{2(0)}, \sigma_{2|1}^{2(0)}, \rho^{(0)}). \] Lee et al. use the maximum likelihood estimates from Tobit models; this reproduction initializes with the prior draws above instead.
Imputation step: Given \(\theta^{(r)}\) at the \(r\)th iteration, generate each censored observation for subject \(i\) by successively sampling from a normal truncated above at the LOD (a minimal sketch of such a draw follows the algorithm): \[ Y_{i1}^{*(r+1)} \sim \text{TruncNormal}\big(\mu_{1|2}^{(r)}, \sigma_{1|2}^{2(r)}\big), \quad \mu_{1|2}^{(r)} = X_i \beta_1^{(r)} + \rho^{(r)} \frac{\sigma_1^{(r)}}{\sigma_2^{(r)}} \big(Y_{i2} - X_i \beta_2^{(r)}\big), \quad \sigma_{1|2}^{2(r)} = \sigma_1^{2(r)} \big(1 - \rho^{(r)2}\big), \] and analogously \[ Y_{i2}^{*(r+1)} \sim \text{TruncNormal}\big(\mu_{2|1}^{(r)}, \sigma_{2|1}^{2(r)}\big). \]
Posterior step: Sample \[ \Sigma^{(r+1)} \sim f\big(\Sigma \mid Y_{C'}, Y_C^{*(r+1)}, \mu^{(r)}\big), \] and then sample \[ \mu^{(r+1)} \sim f\big(\beta \mid Y_{C'}, Y_C^{*(r+1)}, \Sigma^{(r+1)}\big). \]
Repeat the imputation and posterior steps until the chain converges.
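Since the imputation step only ever draws from a univariate normal truncated above at the LOD, it helps to see how such a draw is made. The loop below uses bayesm::rtrun; here is a minimal inverse-CDF sketch of the same idea (rtnorm.left is a hypothetical helper for illustration, not part of the paper's code):
## minimal sketch: normal truncated above at b, via the inverse CDF
## (equivalent to bayesm::rtrun with lower bound a = -Inf)
rtnorm.left <- function(n, mu, sd, b){
  u <- runif(n, 0, pnorm(b, mean = mu, sd = sd)) # uniform over the truncated CDF range
  qnorm(u, mean = mu, sd = sd)                   # map back through the normal quantile
}
## e.g., five draws below an LOD of 0.8: rtnorm.left(5, mu = 1, sd = 1, b = 0.8)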
## begin loop
for(i in 1:n.mcmc){
## Impute censored biomarkers, sample from truncated normal --- --- --- --- --- --- --- --- --- --- --- ---
y1.XB.r <- as.numeric(XXz %*% beta.y1.r)
y2.XB.r <- as.numeric(XXz %*% beta.y2.r)
#### PATTERN 2 : y1 observed, y2 censored. Impute y2 from its conditional (truncated) normal distribution ####
if(length(cens.idx.p2) > 0) {
mean.p2 <- y2.XB.r[cens.idx.p2] + (rho.r * sqrt(sig2.y2.r / sig2.y1.r)) * (yy1.complete[cens.idx.p2] - y1.XB.r[cens.idx.p2])
var.p2 <- sig2.y2.r * (1- rho.r^2)
yy2.imputed.p2 <- bayesm::rtrun(mean.p2,
rep(sqrt(var.p2), length(cens.idx.p2)),
a = c(rep(-1e8, length(cens.idx.p2))),
b = c(rep(lod.2, length(cens.idx.p2))))
yy2.complete[cens.idx.p2] <- yy2.imputed.p2
}
#### PATTERN 3 : y1 censored, y2 observed. Impute y1 from its conditional (truncated) normal distribution ####
if(length(cens.idx.p3) > 0) {
mean.p3 <- y1.XB.r[cens.idx.p3] + (rho.r * sqrt(sig2.y1.r / sig2.y2.r)) * (yy2.complete[cens.idx.p3] - y2.XB.r[cens.idx.p3])
var.p3 <- sig2.y1.r * (1- rho.r^2)
yy1.imputed.p3 <- bayesm::rtrun(mean.p3,
rep(sqrt(var.p3), length(cens.idx.p3)),
a = c(rep(-1e8, length(cens.idx.p3))),
b = c(rep(lod.1, length(cens.idx.p3))))
yy1.complete[cens.idx.p3] <- yy1.imputed.p3
}
#### PATTERN 4 : both y1 and y2 censored. Impute sequentially: y1 given the previous y2, then y2 given the new y1. ####
if(length(cens.idx.p4) > 0) {
## first, impute y1 conditional on previous y2.
mean.y1.p4 <- y1.XB.r[cens.idx.p4] + (rho.r * sqrt(sig2.y1.r / sig2.y2.r)) * (yy2.complete[cens.idx.p4] - y2.XB.r[cens.idx.p4])
var.y1.p4 <- sig2.y1.r * (1- rho.r^2)
yy1.imputed.p4 <- bayesm::rtrun(mean.y1.p4,
rep(sqrt(var.y1.p4), length(cens.idx.p4)),
a = c(rep(-1e8, length(cens.idx.p4))),
b = c(rep(lod.1, length(cens.idx.p4))))
yy1.complete[cens.idx.p4] <- yy1.imputed.p4
## second, impute y2 conditional on newly imputed y1.
mean.y2.p4 <- y2.XB.r[cens.idx.p4] + (rho.r * sqrt(sig2.y2.r / sig2.y1.r)) * (yy1.complete[cens.idx.p4] - y1.XB.r[cens.idx.p4])
var.y2.p4 <- sig2.y2.r * (1- rho.r^2)
yy2.imputed.p4 <- bayesm::rtrun(mean.y2.p4,
rep(sqrt(var.y2.p4), length(cens.idx.p4)),
a = c(rep(-1e8, length(cens.idx.p4))),
b = c(rep(lod.2, length(cens.idx.p4))))
yy2.complete[cens.idx.p4] <- yy2.imputed.p4
}
## PARAMETER UPDATES --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
## update covariance matrix Sigma from inverse wishart
updated.df <- nu0.Sigma + dat.length
residual.mat <- cbind(yy1.complete - y1.XB.r, yy2.complete - y2.XB.r) # residual matrix (Y - XB)
updated.scale.mat <- S0.Sigma + crossprod(residual.mat) # (Y-theta)^T %*% (Y-theta)
Sigma.r <- MCMCpack::riwish(updated.df, updated.scale.mat)
## update variance params and correlation from the covariance matrix
sig2.y1.r <- as.numeric(Sigma.r[1,1])
sig2.y2.r <- as.numeric(Sigma.r[2,2])
rho.r <- as.numeric(Sigma.r[1,2] / sqrt(sig2.y1.r*sig2.y2.r))
## update parameters: sample beta for y1
## (per the prior, Omega = sigma^2 * B0 scales with the current sigma^2 draw)
Omega1.r <- sig2.y1.r * covB.1.0
updated.y1.cov.r <- solve((crossprod(XXz,XXz)/sig2.y1.r) + solve(Omega1.r))
updated.y1.mean.r <- updated.y1.cov.r %*% ((crossprod(XXz,yy1.complete) / sig2.y1.r) + (solve(Omega1.r) %*% beta1.0))
beta.y1.r <- MASS::mvrnorm(n=1, mu = updated.y1.mean.r, Sigma = updated.y1.cov.r)
## update parameters: sample beta for y2
Omega2.r <- sig2.y2.r * covB.2.0
updated.y2.cov.r <- solve((crossprod(XXz,XXz)/sig2.y2.r) + solve(Omega2.r))
updated.y2.mean.r <- updated.y2.cov.r %*% ((crossprod(XXz,yy2.complete) / sig2.y2.r) + (solve(Omega2.r) %*% beta2.0))
beta.y2.r <- MASS::mvrnorm(n=1, mu = updated.y2.mean.r, Sigma = updated.y2.cov.r)
## update samples matrix and imputed dataset --- --- --- --- --- --- --- --- --- --- --- ---
yy1.dat[,i+1] <- yy1.complete
yy2.dat[,i+1] <- yy2.complete
samples[i+1, 2:10] <- c(beta.y1.r[1], beta.y1.r[2], beta.y1.r[3], sig2.y1.r,
beta.y2.r[1], beta.y2.r[2], beta.y2.r[3], sig2.y2.r, rho.r)
} # END OF GIBBS SAMPLER LOOP --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
The traceplots and posterior densities below look reasonable.
# traceplots for the y1 parameters
plot(samples$y1.beta1, type = 'l', col = 'red',
main = 'Traceplots for y1 params', xlab = 'Iteration', ylab = 'Value', ylim = c(0,1.5))
lines(samples$y1.beta2, type = 'l', col = 'blue')
lines(samples$y1.sigma2, type = 'l', col = 'pink')
legend('topright', legend = c('beta1', 'beta2', 'sigma^2'), col = c('red','blue','pink'), lty = 1, bty = 'n')
## plot the posterior densities
library(ggplot2)
foo.df <- with(samples, data.frame(
value = c(y1.beta1, y1.beta2, y1.sigma2),
parameter = rep(c("beta1", "beta2", "sigma^2"),
each = length(iteration))
))
ggplot(foo.df, aes(x = value, color = parameter, fill = parameter)) +
geom_density(alpha = 0.3) + # transparency
labs(title = "Density of params",
x = "Value", y = "Density") +
theme_minimal()
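As a rough quantitative complement to eyeballing the traceplots, effective sample sizes can be checked (this assumes the coda package is installed; it is not used elsewhere in this file):
## effective sample sizes for a couple of parameters (drop the iteration-0 initials)
coda::effectiveSize(samples$y1.beta1[-1])
coda::effectiveSize(samples$rho[-1])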
Following Little and Rubin (2015), perform the MI analysis. Let \(b^{(m)}\) be the estimate of \(b\) from the \(m\)-th complete dataset, \(m=1,\dots,M\). The pooled estimate of \(b\) is \[ \hat{b} = \frac{1}{M} \sum_{m=1}^{M} b^{(m)} \] with variance \[ V(\hat{b}) = \frac{1}{M} \sum_{m=1}^{M} V_m + \left(1+\frac{1}{M}\right) S^2_M, \] where \(V_m\) is the complete-data variance from the \(m\)-th fit and \(S_M^2\) is the between-imputation (sample) variance of the \(b^{(m)}\).
Here we let \(M=5\) and take the completed datasets at iterations \(1000, 2000, 3000, 4000\), and \(5000\) (spacing the draws keeps the imputations approximately independent), and compare against the naive and omniscient estimates.
MI.coef <- sapply(seq(from=1000, to=5000, by=1000),
FUN = function(jj){
y1.tmp <- yy1.dat[,jj]
y2.tmp <- yy2.dat[,jj]
fit <- glm(zz ~ y1.tmp + y2.tmp, family = binomial) # zz and the imputed biomarkers live in this scope
c(summary(fit)$coef[1:3,1:2]) # estimates and SEs as one vector
})
rownames(MI.coef) <- c('mu.b0', 'mu.b1', 'mu.b2', 'se.b0', 'se.b1', 'se.b2')
mi.betas <- MI.coef[1:3,]
mi.se <- MI.coef[4:6,]
## MI mean
pooled.beta <- rowMeans(mi.betas)
## MI variance (Little and Rubin 2015 pp.234)
within.var <- rowMeans(mi.se^2) # within-imputation variance, mean of V_m
between.var <- apply(mi.betas, 1, var) # between-imputation variance S^2_M
DD <- ncol(mi.se) # number of imputations M
pooled.se <- sqrt(within.var + (1 + 1/DD)*between.var)
## t-based 95% CIs using Rubin's degrees of freedom (Little and Rubin), for coverage checks
deg.freedom <- (DD-1) * (1 + (DD/(DD+1)) * within.var/between.var)^2
ci.lower.mi <- pooled.beta - qt(0.975, df = deg.freedom) * pooled.se
ci.upper.mi <- pooled.beta + qt(0.975, df = deg.freedom) * pooled.se
coverage.mi <- (bb >= ci.lower.mi) & (bb <= ci.upper.mi)
#### OMNISCIENT #### --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
true.fit <- glm(zz ~ y1 + y2, data = YY, family = binomial)
ci.lower.omni <- coef(true.fit) - qnorm(0.975) * sqrt(diag(vcov(true.fit)))
ci.upper.omni <- coef(true.fit) + qnorm(0.975) * sqrt(diag(vcov(true.fit)))
coverage.omni <- (bb >= ci.lower.omni) & (bb <= ci.upper.omni)
#### NAIVE LOD/2 #### --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
naive.fit <- glm(zz ~ y1.half.lod + y2.half.lod, data = YY, family = binomial)
ci.lower.naive <- coef(naive.fit) - qnorm(0.975) * sqrt(diag(vcov(naive.fit)))
ci.upper.naive <- coef(naive.fit) + qnorm(0.975) * sqrt(diag(vcov(naive.fit)))
coverage.naive <- (bb >= ci.lower.naive) & (bb <= ci.upper.naive)
View and compare results:
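A minimal sketch of the comparison (the table layout is ours): collect the pooled MI estimates alongside the omniscient and naive fits, plus the CI coverage of the true \(b\).
## point estimates side by side
res <- rbind(true = bb,
MI = pooled.beta,
omniscient = coef(true.fit),
naive = coef(naive.fit))
colnames(res) <- c('b0','b1','b2')
round(res, 3)
## did each 95% CI cover the true coefficient?
rbind(MI = coverage.mi, omniscient = coverage.omni, naive = coverage.naive)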