--- title: "Statistical Matching using Optimal Transport" author: "Raphaël Jauslin" date: "`r Sys.Date()`" output: prettydoc::html_pretty: theme: architect highlight: github vignette: > %\VignetteIndexEntry{Statistical Matching using Optimal Transport} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r, include = FALSE} library(knitr) library(prettydoc) knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) ``` ## Introduction In this vignette we will explain how some functions of the package are used in order to estimate a contingency table. We will work on the `eusilc` dataset of the `laeken` package. All the functions presented in the following are explained in the proposed manuscript by Raphaël Jauslin and Yves Tillé (2021) . ## Contingency table We will estimate the contingency table when the factor variable which represents the economic status `pl030` is crossed with a discretized version of the equivalized household income `eqIncome`. In order to discretize the equivalized income, we calculate percentiles (0.15,0.30,0.45,0.60,0.75,0.90) of the variable and define the category as intervals between the values. ```{r contingency table} library(laeken) library(sampling) library(StratifiedSampling) data("eusilc") eusilc <- na.omit(eusilc) N <- nrow(eusilc) # Xm are the matching variables and id are identity of the units Xm <- eusilc[,c("hsize","db040","age","rb090","pb220a")] Xmcat <- do.call(cbind,apply(Xm[,c(2,4,5)],MARGIN = 2,FUN = disjunctive)) Xm <- cbind(Xmcat,Xm[,-c(2,4,5)]) id <- eusilc$rb030 # categorial income splitted by the percentile c_income <- eusilc$eqIncome q <- quantile(eusilc$eqIncome, probs = seq(0, 1, 0.15)) c_income[which(eusilc$eqIncome <= q[2])] <- "(0,15]" c_income[which(q[2] < eusilc$eqIncome & eusilc$eqIncome <= q[3])] <- "(15,30]" c_income[which(q[3] < eusilc$eqIncome & eusilc$eqIncome <= q[4])] <- "(30,45]" c_income[which(q[4] < eusilc$eqIncome & eusilc$eqIncome <= q[5])] <- "(45,60]" c_income[which(q[5] < eusilc$eqIncome & eusilc$eqIncome <= q[6])] <- "(60,75]" c_income[which(q[6] < eusilc$eqIncome & eusilc$eqIncome <= q[7])] <- "(75,90]" c_income[which( eusilc$eqIncome > q[7] )] <- "(90,100]" # variable of interests Y <- data.frame(ecostat = eusilc$pl030) Z <- data.frame(c_income = c_income) # put same rownames rownames(Xm) <- rownames(Y) <- rownames(Z)<- id YZ <- table(cbind(Y,Z)) addmargins(YZ) ``` ## Sampling schemes Here we set up the sampling designs and define all the quantities we will need for the rest of the vignette. The sample are selected with simple random sampling without replacement and the weights are equal to the inverse of the inclusion probabilities. ```{r sample_scheme} # size of sample n1 <- 1000 n2 <- 500 # samples s1 <- srswor(n1,N) s2 <- srswor(n2,N) # extract matching units X1 <- Xm[s1 == 1,] X2 <- Xm[s2 == 1,] # extract variable of interest Y1 <- data.frame(Y[s1 == 1,]) colnames(Y1) <- colnames(Y) Z2 <- as.data.frame(Z[s2 == 1,]) colnames(Z2) <- colnames(Z) # extract correct identities id1 <- id[s1 == 1] id2 <- id[s2 == 1] # put correct rownames rownames(Y1) <- id1 rownames(Z2) <- id2 # here weights are inverse of inclusion probabilities d1 <- rep(N/n1,n1) d2 <- rep(N/n2,n2) # disjunctive form Y_dis <- sampling::disjunctive(as.matrix(Y)) Z_dis <- sampling::disjunctive(as.matrix(Z)) Y1_dis <- Y_dis[s1 ==1,] Z2_dis <- Z_dis[s2 ==1,] ``` ## Harmonization Then the harmonization step must be performed. The `harmonize` function returns the harmonized weights. If by chance the true population totals are known, it is possible to use these instead of the estimate made within the function. ```{r harm} re <- harmonize(X1,d1,id1,X2,d2,id2) # if we want to use the population totals to harmonize we can use re <- harmonize(X1,d1,id1,X2,d2,id2,totals = c(N,colSums(Xm))) w1 <- re$w1 w2 <- re$w2 colSums(Xm) colSums(w1*X1) colSums(w2*X2) ``` ## Optimal transport matching The statistical matching is done by using the `otmatch` function. The estimation of the contingency table is calculated by extracting the `id1` units (respectively `id2` units) and by using the function `tapply` with the correct weights. ```{r OT} # Optimal transport matching object <- otmatch(X1,id1,X2,id2,w1,w2) head(object[,1:3]) Y1_ot <- cbind(X1[as.character(object$id1),],y = Y1[as.character(object$id1),]) Z2_ot <- cbind(X2[as.character(object$id2),],z = Z2[as.character(object$id2),]) YZ_ot <- tapply(object$weight,list(Y1_ot$y,Z2_ot$z),sum) # transform NA into 0 YZ_ot[is.na(YZ_ot)] <- 0 # result round(addmargins(YZ_ot),3) ``` ## Balanced sampling As you can see from the previous section, the optimal transport results generally do not have a one-to-one match, meaning that for every unit in sample 1, we have more than one unit with weights not equal to 0 in sample 2. The `bsmatch` function creates a one-to-one match by selecting a balanced stratified sampling to obtain a data.frame where each unit in sample 1 has only one imputed unit from sample 2. ```{r BS} # Balanced Sampling BS <- bsmatch(object) head(BS$object[,1:3]) Y1_bs <- cbind(X1[as.character(BS$object$id1),],y = Y1[as.character(BS$object$id1),]) Z2_bs <- cbind(X2[as.character(BS$object$id2),],z = Z2[as.character(BS$object$id2),]) YZ_bs <- tapply(BS$object$weight/BS$q,list(Y1_bs$y,Z2_bs$z),sum) YZ_bs[is.na(YZ_bs)] <- 0 round(addmargins(YZ_bs),3) # With Z2 as auxiliary information for stratified balanced sampling. BS <- bsmatch(object,Z2) Y1_bs <- cbind(X1[as.character(BS$object$id1),],y = Y1[as.character(BS$object$id1),]) Z2_bs <- cbind(X2[as.character(BS$object$id2),],z = Z2[as.character(BS$object$id2),]) YZ_bs <- tapply(BS$object$weight/BS$q,list(Y1_bs$y,Z2_bs$z),sum) YZ_bs[is.na(YZ_bs)] <- 0 round(addmargins(YZ_bs),3) ``` ## Prediction ```{r} # split the weight by id1 q_l <- split(object$weight,f = object$id1) # normalize in each id1 q_l <- lapply(q_l, function(x){x/sum(x)}) q <- as.numeric(do.call(c,q_l)) Z_pred <- t(q*disjunctive(object$id1))%*%disjunctive(Z2[as.character(object$id2),]) colnames(Z_pred) <- levels(factor(Z2$c_income)) head(Z_pred) ```