Inverse Probability of Sampling Weights (IPSW)

Simulated Flu Data

data <- readRDS("~/Documents/EPID 799 R/2018/flu_data.rds")
library(tidyverse) 

#Take a look at a subset of observations
data %>% 
  dplyr::select(age_group, sex_f, health_seek_claims, comorbid, flu_vax, flu_inf, died) %>% 
  tail(n=10) 
##       age_group  sex_f health_seek_claims comorbid flu_vax flu_inf died
## 19991       85+ Female                  0        1       0       0    0
## 19992     80-84   Male                  1        1       1       0    0
## 19993     80-84   Male                  1        0       0       0    0
## 19994     80-84   Male                  0        1       0       0    0
## 19995     80-84 Female                  0        1       1       0    0
## 19996     80-84 Female                  1        1       1       0    0
## 19997       85+   Male                  1        1       0       0    0
## 19998     75-79 Female                  1        0       0       0    0
## 19999     80-84 Female                  1        1       1       0    0
## 20000     65-69   Male                  0        0       1       0    0

Exposure and Outcome Variables

#what is the number and percent of participants with influenza vaccination?
data %>% 
  dplyr::group_by(flu_vax) %>% 
  dplyr::summarise(number=n(), percent=n()/nrow(data)*100)
## # A tibble: 2 x 3
##   flu_vax number percent
##     <int>  <int>   <dbl>
## 1       0   9567    47.8
## 2       1  10433    52.2
#what is the number and percent of participants with influenza infection?
data %>% 
  dplyr::group_by(flu_inf) %>% 
  dplyr::summarise(number=n(), percent=n()/nrow(data)*100)
## # A tibble: 2 x 3
##   flu_inf number percent
##     <int>  <int>   <dbl>
## 1       0  18885   94.4 
## 2       1   1115    5.58

Age Distributions

#Number and percent of study participants in each age group
observed <- data %>% 
  dplyr::group_by(age_group) %>% 
  dplyr::summarise(size_observed=n(), percent_observed=n()/nrow(data)*100)

#Study population
observed
## # A tibble: 5 x 3
##   age_group size_observed percent_observed
##   <fct>             <int>            <dbl>
## 1 65-69              4000             20.0
## 2 70-74              4018             20.1
## 3 75-79              4000             20.0
## 4 80-84              3912             19.6
## 5 85+                4070             20.3
#US population size for each age group based on census data
age <- data.frame(age_group=c("65-69","70-74","75-79","80-84","85+"),
                  size_target=c(16820083,11810247,8367895,5865639,6380331))

#Calculating the percent of the older adult population in each age group
age$percent_target <- age$size_target/sum(age$size_target) *100

#Adding variables for age groups in our study population
age <- left_join(age, observed, by="age_group") 

#All age data
age
##   age_group size_target percent_target size_observed percent_observed
## 1     65-69    16820083       34.15648          4000            20.00
## 2     70-74    11810247       23.98302          4018            20.09
## 3     75-79     8367895       16.99265          4000            20.00
## 4     80-84     5865639       11.91133          3912            19.56
## 5       85+     6380331       12.95651          4070            20.35
#Going from wide to long data for plotting
a_long <- gather(age, key = key, value = value, percent_target, percent_observed)

#Adding labels
a_long$key_f <- factor(a_long$key, 
                       levels = c("percent_observed", "percent_target"),
                       labels = c("Observed", "Target"))

#Plot the percentages of the target population and the study population in each age group
ggplot(data=a_long, aes(x=age_group, y=value, fill=key_f))+
  geom_bar(stat = "identity",position = "dodge",width = .3)+
  theme_classic()+
  xlab("Age group")+
  ylab("Percent of population")+
  theme(legend.title = element_blank(),legend.position = c(.9,.9))

Exposure and Outcome by Age

We observed that the prevalence of flu vaccination in our study population was 52%, and the incidence of flu infection was 5.6%. In order to infer these estimates to the target population, do we need to account for the differing age distributions?

#Vaccination and infection by age group
data %>%
  dplyr::group_by(age_group) %>%
  dplyr::summarise(percent_vaccination=sum(flu_vax)/n(),
            percent_infection=sum(flu_inf)/n())
## # A tibble: 5 x 3
##   age_group percent_vaccination percent_infection
##   <fct>                   <dbl>             <dbl>
## 1 65-69                   0.794            0.0272
## 2 70-74                   0.735            0.0356
## 3 75-79                   0.508            0.0525
## 4 80-84                   0.359            0.0713
## 5 85+                     0.212            0.0919

Create Sampling Weights

To account for the different age distribution of our study population, we can calculate the sampling fraction for each participant based on their age group. The sampling weight for each participant is then calculated as 1/sampling fraction.

#In our dataset, adding variables for sizes of observed age group and target age group 
age <- age %>% select(age_group, size_target, size_observed)
data <- left_join(data, age, by="age_group")

#Then calculate the sampling fraction
data$sample_fraction <- data$size_observed / data$size_target  

#The sampling weight is 1/sampling fraction
data$sample_weight <- 1 / data$sample_fraction

#These are the 5 values for sampling fraction and weight in our data, based on the 5 age groups
data %>% 
  dplyr::group_by(age_group) %>%
  dplyr::summarise(sample_fraction=unique(sample_fraction), sample_weight=unique(sample_weight))
## # A tibble: 5 x 3
##   age_group sample_fraction sample_weight
##   <fct>               <dbl>         <dbl>
## 1 65-69            0.000238          4205
## 2 70-74            0.000340          2939
## 3 75-79            0.000478          2092
## 4 80-84            0.000667          1499
## 5 85+              0.000638          1568

Apply Sampling Weights

The weighted prevalence of vaccination (for the target population) is 60% (compared to 52% in the study population).

#What is the weighted prevalence estimate?
data %>%
  dplyr::group_by(flu_vax) %>%
  dplyr::summarise(n=n(), n_weight=sum(sample_weight)) %>%
  dplyr::mutate(prev_unweight=n/sum(n), prev_weight=n_weight/sum(n_weight))
## # A tibble: 2 x 5
##   flu_vax     n n_weight prev_unweight prev_weight
##     <int> <int>    <dbl>         <dbl>       <dbl>
## 1       0  9567 19486108         0.478       0.396
## 2       1 10433 29758087         0.522       0.604

The weighted incidence of infection (for the target population) is 4.7% (compared to 5.6% in the study population).

#What is the weighted incidence estimate?
data %>%
  dplyr::group_by(flu_inf) %>%
  dplyr::summarise(n=n(), n_weight=sum(sample_weight)) %>%
  dplyr::mutate(prev_unweight=n/sum(n), prev_weight=n_weight/sum(n_weight))
## # A tibble: 2 x 5
##   flu_inf     n n_weight prev_unweight prev_weight
##     <int> <int>    <dbl>         <dbl>       <dbl>
## 1       0 18885 46921576        0.944       0.953 
## 2       1  1115  2322619        0.0558      0.0472

Inverse Probability of Treatment Weights (IPTW)

Births Data

births <- readRDS("~/Documents/EPID 799 R/2018/R for epi 2018 data pack/birth_cohort.rds")
library(broom)
library(tableone)
library(dagitty)
library(ggdag)
library(ROCR)
library(plotROC)
library(survey)
library(geepack)

#Subset to variables of interest for our model, exclude missing
df <- births %>% 
  dplyr::select(x, wksgest, preterm, preterm_f, pnc5, pnc5_f, mage, raceeth_f, smoker_f) %>%
  dplyr::filter(!is.na(preterm_f) & !is.na(pnc5_f) & !is.na(mage) & !is.na(raceeth_f) & !is.na(smoker_f))

Potential Confounders

dag <- dagitty::downloadGraph("dagitty.net/mAqKBcR")  
ggdag::ggdag_adjustment_set(dag) + 
  theme_dag_blank() + 
  geom_dag_text(color="black")

tableone::CreateTableOne(vars = c("mage","raceeth_f","smoker_f"), 
               strata = "pnc5_f", test = F,
               data = df)
##                        Stratified by pnc5_f
##                         No Early PNC  Early PNC    
##   n                      5433         56064        
##   mage (mean (sd))      26.10 (6.11)  27.65 (5.90) 
##   raceeth_f (%)                                    
##      AA                  1834 (33.8)  12763 (22.8) 
##      AI or AN              86 ( 1.6)    782 ( 1.4) 
##      Other               1238 (22.8)   8619 (15.4) 
##      WH                   175 ( 3.2)   1387 ( 2.5) 
##      WnH                 2100 (38.7)  32513 (58.0) 
##   smoker_f = Smoker (%)   898 (16.5)   5646 (10.1)

Propensity Score Model

#Probability of early prenatal care, as a function of maternal age, race/ethnicity, and smoking status
summary(ps_model <- glm(data = df, pnc5_f ~ poly(mage, 2) + raceeth_f + smoker_f, family=binomial("logit")))
## 
## Call:
## glm(formula = pnc5_f ~ poly(mage, 2) + raceeth_f + smoker_f, 
##     family = binomial("logit"), data = df)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5008   0.3064   0.3758   0.4796   0.9484  
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         2.11585    0.02658  79.610  < 2e-16 ***
## poly(mage, 2)1     47.37029    3.57311  13.257  < 2e-16 ***
## poly(mage, 2)2    -27.67095    3.42022  -8.090 5.95e-16 ***
## raceeth_fAI or AN   0.43583    0.11767   3.704 0.000212 ***
## raceeth_fOther     -0.16177    0.04023  -4.021 5.79e-05 ***
## raceeth_fWH         0.02710    0.08460   0.320 0.748720    
## raceeth_fWnH        0.75215    0.03439  21.871  < 2e-16 ***
## smoker_fSmoker     -0.71256    0.04094 -17.407  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 36738  on 61496  degrees of freedom
## Residual deviance: 35404  on 61489  degrees of freedom
## AIC: 35420
## 
## Number of Fisher Scoring iterations: 5
#Use the model to predict the probability of early prenatal care (propensity score) for each observation
df$ps <- predict(ps_model, type = "response")

#Distribution of propensity score...
summary(df$ps[df$pnc5==1]) #...among mothers with observed early prenatal care
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.6378  0.8901  0.9151  0.9136  0.9521  0.9562
summary(df$ps[df$pnc5==0]) #...among mothers with no observed early prenatal care
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.6949  0.8656  0.8953  0.8912  0.9270  0.9562
#Plot propensity score, by observed prenatal care
ggplot(data=df,aes(x=ps, group=factor(pnc5_f), fill=factor(pnc5_f)))+
  geom_histogram(aes(y=..density..),alpha = 0.75,binwidth=0.02,position = position_dodge(width=0.01))+
  theme_classic()+
  xlab("Predicted probability of early prenatal care")+
  labs(fill = "Observed")

#Function to calculate the AUC and MSPE (measures of predictive ability)
pred.eval <- function(preds,outcome)
{
  pred.sum=prediction(preds,outcome)
  auc=as.numeric(performance(pred.sum,"auc")@y.values)
  mspe=mean((preds-outcome)^2)
  list(auc=auc,mspe=mspe)
}
pred.eval(preds = df$ps, outcome = df$pnc5)
## $auc
## [1] 0.6461399
## 
## $mspe
## [1] 0.07879287

Use Propensity Score for IPTW

#Use propensity score to calculate unstabilized IPTW
df$iptw_u <- ifelse(df$pnc5==1, 1/df$ps, 1/(1-df$ps)) 

#Distribution of weights in the exposed
summary(df$iptw_u[df$pnc5==1]) 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.046   1.050   1.093   1.097   1.124   1.568
#Distribution of weights in the unexposed
summary(df$iptw_u[df$pnc5==0]) 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   3.277   7.442   9.554  11.222  13.696  22.806
#Distribution of weights in the total population
summary(df$iptw_u) 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.046   1.051   1.098   1.992   1.139  22.806
#Used marginal probability of exposure (early prenatal care) to stabilize weights
p_exposure <- sum(df$pnc5) / nrow(df)
df$iptw_s <- ifelse(df$pnc5==1, p_exposure/df$ps, (1-p_exposure)/(1-df$ps))

#Distribution of weights in the exposed
summary(df$iptw_s[df$pnc5==1]) 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.9535  0.9575  0.9962  1.0001  1.0243  1.4294
#Distribution of weights in the unexposed
summary(df$iptw_s[df$pnc5==0]) 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.2895  0.6575  0.8441  0.9914  1.2100  2.0148
#Distribution of weights in the total population
summary(df$iptw_s) 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.2895  0.9566  0.9851  0.9993  1.0257  2.0148

Reweighted Population

weighted_df <- svydesign(~0, weights = df$iptw_s, data=df)

svyCreateTableOne(vars = c("mage","raceeth_f","smoker_f"), 
                  strata = "pnc5_f", test = F, 
                  data = weighted_df)
##                        Stratified by pnc5_f
##                         No Early PNC    Early PNC       
##   n                     5386.47         56069.25        
##   mage (mean (sd))        27.36 (5.96)     27.51 (5.93) 
##   raceeth_f (%)                                         
##      AA                  1293.7 (24.0)   13309.1 (23.7) 
##      AI or AN              79.8 ( 1.5)     791.8 ( 1.4) 
##      Other                879.2 (16.3)    8988.9 (16.0) 
##      WH                   138.6 ( 2.6)    1424.2 ( 2.5) 
##      WnH                 2995.2 (55.6)   31555.2 (56.3) 
##   smoker_f = Smoker (%)   613.0 (11.4)    5975.3 (10.7)
ggplot(data=df, aes(x=ps, weight=iptw_s, group=factor(pnc5), fill=factor(pnc5)))+
  geom_histogram(aes(y=..density..),alpha = 0.75,binwidth=0.02,position = position_dodge(width=0.01))+
  theme_classic()+
  xlab("Predicted probability of early prenatal care")+
  labs(fill = "Observed")

Weighted Model

broom::tidy(geeglm(data = df, preterm ~ pnc5, family=binomial("identity"), weight=iptw_s, id = x))
##          term    estimate   std.error statistic      p.value
## 1 (Intercept)  0.11939288 0.004730875 636.90384 0.000000e+00
## 2        pnc5 -0.02411346 0.004892532  24.29136 8.280979e-07
#Compare to adjusted model
broom::tidy(glm(data = df, preterm ~ pnc5 + poly(mage, 2) + raceeth_f + smoker_f, family=binomial("identity")))
##                term     estimate   std.error   statistic       p.value
## 1       (Intercept)  0.153139641 0.005063951  30.2411367 6.823307e-201
## 2              pnc5 -0.022394044 0.004637421  -4.8289870  1.372294e-06
## 3    poly(mage, 2)1  0.468630828 0.309838748   1.5124991  1.304069e-01
## 4    poly(mage, 2)2  2.516000907 0.313959366   8.0137788  1.112367e-15
## 5 raceeth_fAI or AN -0.007913101 0.011861707  -0.6671131  5.046999e-01
## 6    raceeth_fOther -0.037644700 0.004133554  -9.1071033  8.461084e-20
## 7       raceeth_fWH -0.055691625 0.007342586  -7.5847430  3.331469e-14
## 8      raceeth_fWnH -0.055744896 0.003236356 -17.2245897  1.736549e-66
## 9    smoker_fSmoker  0.033199906 0.004262331   7.7891435  6.746493e-15

Marginal Structural Models

Time-Varying Depression

library(networkD3)
library(magrittr)
library(reshape2)
load("~/Documents/Dissertation/nodes.Rda")
load("~/Documents/Dissertation/links.Rda")
a <- readRDS("~/Documents/Dissertation/data_for_msm_long.rds")

#Time-varying depressive symptoms
networkD3::sankeyNetwork(Links = links, Nodes = nodes, Source = "source",
              Target = "target", Value = "value", NodeID ="name",
              fontSize = 16, fontFamily = "Arial", nodeWidth = 20,nodePadding = 5)

Time-Fixed and Time-Varying Confounding

  • Time-fixed: Baseline values of age, intervention arm, marital status, employment, new diagnosis/baseline ART use, history of overdose, alcohol use.
  • Time-varying: Depression, risk behaviors, and CD4 cell count at last visit.
tableone::CreateTableOne(vars = c("knowStatus_art", "arm_f", "marital_f", "employ_f", "age_bl", "history_od", "drink_f", "anysex_nocondom_3m", "injectrisk_any"), 
               strata = "cesd23", test = F,
               data = a[a$visitnum==1,])
##                                 Stratified by cesd23
##                                  0             1            
##   n                                253           201        
##   knowStatus_art (%)                                        
##      New HIV+                      194 (76.7)    141 (70.1) 
##      Know HIV+/No ART               27 (10.7)     41 (20.4) 
##      Know HIV+/On ART               32 (12.6)     19 ( 9.5) 
##   arm_f (%)                                                 
##      Control                        48 (19.0)     41 (20.4) 
##      Subdistrict                    80 (31.6)     59 (29.4) 
##      Individual                     54 (21.3)     41 (20.4) 
##      Both                           71 (28.1)     60 (29.9) 
##   marital_f (%)                                             
##      Single                         83 (32.8)     92 (45.8) 
##      Married/cohabitating          141 (55.7)     73 (36.3) 
##      Widow/Divorce/Sep              29 (11.5)     36 (17.9) 
##   employ_f (%)                                              
##      Work full-time                183 (72.3)    131 (65.2) 
##      Work part-time                 42 (16.6)     40 (19.9) 
##      Unemployed/other               28 (11.1)     30 (14.9) 
##   age_bl (mean (sd))             35.26 (6.20)  35.10 (6.40) 
##   history_od (mean (sd))          0.14 (0.35)   0.24 (0.43) 
##   drink_f = Alcohol use (%)        199 (78.7)    108 (53.7) 
##   anysex_nocondom_3m (mean (sd))  0.27 (0.44)   0.19 (0.40) 
##   injectrisk_any (mean (sd))      0.66 (0.47)   0.81 (0.39)

Model specifications. Logistic model for the probability of depressive symptoms (D) at each visit, as a of time-fixed confounders at baseline (C) and time-varying depressive symptoms (D), risk behavior (Y), and CD4 count (Z) at the last visit.

Propensity Score Model Outcome Model
D(t) ~ C + Z(t-1) + Y(t-1) + D(t-1) Y(t+1) ~ D(t)

Crude Effect of Depression on Risk Behavior

#Note for the purposes of this example, I'm just doing a complete case analysis. However, there is non-negligible missing data, and if not properly addressed, could bias these estimates. Two approaches to deal with this (that I'm currently working on!) are to combine the IPTW with IPOW (missingness weights), or to use multiple imputation and perform the IPTW analysis on the imputed data.
complete <- a %>% 
  dplyr::filter(!is.na(last_cd4)
         & !is.na(last_cesd23)
         & !is.na(last_injectrisk)
         & !is.na(cesd23)
         & !is.na(next_injectrisk))

complete %>% 
  dplyr::group_by(cesd23) %>% 
  dplyr::summarise(n=n(), outcome=sum(next_injectrisk, na.rm=T)) %>%
  dplyr::mutate(risk=outcome/n)
## # A tibble: 2 x 4
##   cesd23     n outcome   risk
##    <dbl> <int>   <dbl>  <dbl>
## 1   0      523    29.0 0.0554
## 2   1.00   293    34.0 0.116
broom::tidy(glm(next_injectrisk ~ cesd23, family = binomial(link = "identity"), data = complete))
##          term   estimate  std.error statistic      p.value
## 1 (Intercept) 0.05544933 0.01000714  5.540977 3.007883e-08
## 2      cesd23 0.06059162 0.02121862  2.855588 4.295724e-03
broom::tidy(glm(next_injectrisk ~ cesd23, family = binomial(link = "log"), data = complete), exponentiate = T)
##          term   estimate std.error  statistic      p.value
## 1 (Intercept) 0.05544933 0.1804678 -16.026598 8.332129e-58
## 2      cesd23 2.09273861 0.2420068   3.051457 2.277334e-03

Weighted Effect of Depression on Risk Behaviors

#Probability of depression as a function of last values of depression, CD4 count, and risk behaviors (time-varying), and baseline values of time-fixed confounders
summary(depression_model <- glm(cesd23 ~ last_cesd23 + last_cd4 + last_injectrisk + arm_f + marital_f + employ_f + knowStatus_art + age_bl + history_od + drink_f, family=binomial("logit"), data = complete))
## 
## Call:
## glm(formula = cesd23 ~ last_cesd23 + last_cd4 + last_injectrisk + 
##     arm_f + marital_f + employ_f + knowStatus_art + age_bl + 
##     history_od + drink_f, family = binomial("logit"), data = complete)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.8771  -0.8074  -0.5637   0.9524   2.2922  
## 
## Coefficients:
##                                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                    -1.42422    0.54424  -2.617  0.00887 ** 
## last_cesd23                     1.51168    0.16636   9.087  < 2e-16 ***
## last_cd4200<=CD4<500           -0.07543    0.17907  -0.421  0.67358    
## last_cd4CD4>=500               -0.14918    0.27421  -0.544  0.58641    
## last_injectrisk                 0.74803    0.16746   4.467 7.93e-06 ***
## arm_fSubdistrict               -0.34704    0.23787  -1.459  0.14458    
## arm_fIndividual                -0.67149    0.27758  -2.419  0.01556 *  
## arm_fBoth                      -0.23792    0.22981  -1.035  0.30053    
## marital_fMarried/cohabitating  -0.34408    0.19604  -1.755  0.07923 .  
## marital_fWidow/Divorce/Sep     -0.46262    0.29711  -1.557  0.11945    
## employ_fWork part-time          0.01070    0.23151   0.046  0.96312    
## employ_fUnemployed/other        0.41752    0.26149   1.597  0.11033    
## knowStatus_artKnow HIV+/No ART -0.21729    0.25537  -0.851  0.39484    
## knowStatus_artKnow HIV+/On ART -0.51781    0.24470  -2.116  0.03434 *  
## age_bl                          0.01949    0.01419   1.374  0.16952    
## history_od                      0.09042    0.21663   0.417  0.67638    
## drink_fAlcohol use             -0.25299    0.19280  -1.312  0.18945    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1065.50  on 815  degrees of freedom
## Residual deviance:  900.98  on 799  degrees of freedom
## AIC: 934.98
## 
## Number of Fisher Scoring iterations: 4
#Use the model to predict the probability of depression (propensity score) for each observation
complete$predict <- predict(depression_model, type = "response")

#Plot propensity score, by observed depression
ggplot(data=complete,aes(x=predict, group=factor(cesd23), fill=factor(cesd23)))+
  geom_histogram(aes(y=..density..),alpha = 0.75,binwidth=0.02,position = position_dodge(width=0.01))+
  theme_classic()+
  xlab("Predicted probability of CES-D≥23")+
  labs(fill = "CES-D≥23")

pred.eval(complete$predict, complete$cesd23)
## $auc
## [1] 0.7595097
## 
## $mspe
## [1] 0.1851221
#Used propensity score to calculate unstabilized inverse probability of depression weights
complete$iptw_u <- ifelse(complete$cesd23==1, 1/complete$predict, 1/(1-complete$predict)) 

#Distribution of unstabilized weights in the exposed
summary(complete$iptw_u[complete$cesd23==1]) 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.204   1.516   1.885   2.814   3.249  13.833
#Distribution of unstabilized weights in the unexposed
summary(complete$iptw_u[complete$cesd23==0]) 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.066   1.187   1.299   1.555   1.649   5.822
#Distribution of unstabilized weights in the total population
summary(complete$iptw_u)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.066   1.229   1.455   2.007   2.103  13.833
#Used marginal probability of exposure (for complete cases) to stabilize weights
p_exposure <- sum(complete$cesd23) / nrow(complete)
complete$iptw_s <- ifelse(complete$cesd23==1, p_exposure/complete$predict, (1-p_exposure)/(1-complete$predict))

#Distribution of stabilized weights in the exposed
summary(complete$iptw_s[complete$cesd23==1])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.4323  0.5442  0.6770  1.0104  1.1667  4.9668
#Distribution of unstabilized weights in the unexposed
summary(complete$iptw_s[complete$cesd23==0]) 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.6834  0.7605  0.8328  0.9965  1.0570  3.7318
#Distribution of weights in the total population
summary(complete$iptw_s)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.4323  0.7189  0.8048  1.0015  1.0936  4.9668
#This is the reweighted population
ggplot(data=complete, aes(x=predict, weight=iptw_s, group=factor(cesd23), fill=factor(cesd23)))+
  geom_histogram(aes(y=..density..),alpha = 0.75,binwidth=0.02,position = position_dodge(width=0.01))+
  theme_classic()+
  xlab("Predicted probability of CES-D≥23")+
  labs(fill = "CES-D≥23")

#Estimate the weighted measures of effect
broom::tidy(geeglm(next_injectrisk ~ cesd23, family=binomial(link = "identity"), weight=iptw_s, id = pidnum, data = complete)) 
##          term   estimate  std.error statistic      p.value
## 1 (Intercept) 0.06030404 0.01243601 23.514223 1.239938e-06
## 2      cesd23 0.03244441 0.02163592  2.248688 1.337278e-01
broom::tidy(geeglm(next_injectrisk ~ cesd23, family=binomial(link = "log"), weight=iptw_s, id = pidnum, data = complete), exponentiate = T) 
##          term   estimate std.error  statistic   p.value
## 1 (Intercept) 0.06030404 0.2062219 185.453485 0.0000000
## 2      cesd23 1.53801383 0.2693021   2.555349 0.1099215