OPTIONS NOCENTER FORMCHAR = "|----|+|---+=";
                
TITLE1 'Data From: R. Mermelstein NCI smoking cessation project';   
TITLE2 'Looking at imputation of smoking variable - OR = 2';                
/* looking at last timepoint and collapsing controls & no-shows vs
   two treatment groups */

/* performing simulation for the missing observations - OR=2 for MISSING & SMOKING */
/* putting in uncertainty attributable to BETAs */
/* USING PROC MIANALYZE for getting the MI results */


DATA one;
INFILE 'c:\smoke.dat';       
INPUT  id smk miss smk0 grp;

/* these variables are coded as follows: 
     grp  0=tx   1=control
     smk  (smoking at final endpoint - 24 month followup)    0=abstinent 1=smoking
     smk0 (smoking at post-intervention)                     0=abstinent 1=smoking 
     miss (missing at final endpoint)                        0=observed  1=missing */

/* observed cell frequencies - these were obtained previous to this analysis */
n111 = 42;   /* number of abstainers - smk0=abstinent */
n112 = 71;   /* number of smokers    - smk0=abstinent */
n211 = 36;   /* number of abstainers - smk0=smoking   */
n212 = 223;  /* number of smokers    - smk0=smoking   */

/* using OR=2 for MISSING with SMOKING */
orat = 2;

beta0m = LOG(n112/n111);
beta1m = LOG(orat);
beta2m = LOG(n212/n211);
beta3m = LOG(orat);

seed  = 974677743; 


/* figure out cell frequencies for the missing cells                */
/* these are based on the observed cell frequencies and the assumed odds ratio */
n12dot = 37; /* number of missing for smk0=abstinent */
p122 = (orat*(n112/n111))/(1 + (orat*(n112/n111)));
n122 = p122*n12dot;
n121 = (1 - p122)*n12dot;

n22dot = 80; /* number of missing for smk0=smoking   */
p222 = (orat*(n212/n211))/(1 + (orat*(n212/n211)));
n222 = p222*n22dot;
n221 = (1 - p222)*n22dot;

/* variance associated with betas */
beta0v  = (n111 + n112)/(n111*n112);
beta1v  = 1/n111 + 1/n112 + 1/n121 + 1/n122;
beta2v  = (n211 + n212)/(n211*n212);
beta3v  = 1/n211 + 1/n212 + 1/n221 + 1/n222;


/* 100 simulated datasets */
DATA sim; SET one;
ARRAY smks(100) smks1-smks100; 
DO i = 1 TO 100;
IF miss EQ 0 THEN smks(i) = smk;
IF miss EQ 1 THEN DO;
   exp1 =  RANEXP(seed); exp2 = RANEXP(seed); stdl = LOG(exp1/exp2);
   ran0  = RANNOR(seed); ran1 = RANNOR(seed);

  /* the next lines incorporate the covariance between beta0 and beta1 
     using the Cholesky factorization of the variance-covariance matrix
     (and likewise for beta2 and beta3) */
   beta0 = beta0m + ran0*SQRT(beta0v);
   beta1 = beta1m - ran0*SQRT(beta0v) + ran1*SQRT(beta1v - beta0v);
   ran2  = RANNOR(seed); ran3 = RANNOR(seed);
   beta2 = beta2m + ran2*SQRT(beta2v);
   beta3 = beta3m - ran2*SQRT(beta2v) + ran3*SQRT(beta3v - beta2v);

   ystar = (1 - smk0)*(beta0 + beta1*miss) + smk0*(beta2 + beta3*miss) + stdl;

smks(i) = 0; IF ystar > 0 THEN smks(i) = 1;
END;
END;

/* set up the data in long format - with imputation number being a variable */
DATA unisim (KEEP = id grp smki _imputation_); SET sim;
ARRAY smks(100) smks1-smks100;
DO _imputation_ = 1 to 100;
   smki = smks(_imputation_);
   OUTPUT;
END;

PROC SORT; BY _imputation_ ;
RUN;

PROC LOGISTIC DESCENDING NOPRINT OUTEST=outlreg COVOUT;
MODEL smki = grp / LINK = LOGIT;
BY _imputation_ ;
RUN;

PROC MIANALYZE DATA=outlreg;
     VAR intercept grp;
RUN;