Statistical Matching using Optimal Transport

Introduction

In this vignette we will explain how some functions of the package are used in order to estimate a contingency table. We will work on the eusilc dataset of the laeken package. All the functions presented in the following are explained in the proposed manuscript by Raphaël Jauslin and Yves Tillé (2021) doi:10.1016/j.jspi.2022.12.003.

Contingency table

We will estimate the contingency table when the factor variable which represents the economic status pl030 is crossed with a discretized version of the equivalized household income eqIncome. In order to discretize the equivalized income, we calculate percentiles (0.15,0.30,0.45,0.60,0.75,0.90) of the variable and define the category as intervals between the values.

library(laeken)
library(sampling)
library(StratifiedSampling)
#> Loading required package: Matrix

data("eusilc")
eusilc <- na.omit(eusilc)
N <- nrow(eusilc)


# Xm are the matching variables and id are identity of the units
Xm <- eusilc[,c("hsize","db040","age","rb090","pb220a")]
Xmcat <- do.call(cbind,apply(Xm[,c(2,4,5)],MARGIN = 2,FUN = disjunctive))
Xm <- cbind(Xmcat,Xm[,-c(2,4,5)])
id <- eusilc$rb030


# categorial income splitted by the percentile
c_income  <- eusilc$eqIncome
q <- quantile(eusilc$eqIncome, probs = seq(0, 1, 0.15))
c_income[which(eusilc$eqIncome <= q[2])] <- "(0,15]"
c_income[which(q[2] < eusilc$eqIncome & eusilc$eqIncome <= q[3])] <- "(15,30]"
c_income[which(q[3] < eusilc$eqIncome & eusilc$eqIncome <= q[4])] <- "(30,45]"
c_income[which(q[4] < eusilc$eqIncome & eusilc$eqIncome <= q[5])] <- "(45,60]"
c_income[which(q[5] < eusilc$eqIncome & eusilc$eqIncome <= q[6])] <- "(60,75]"
c_income[which(q[6] < eusilc$eqIncome & eusilc$eqIncome <= q[7])] <- "(75,90]"
c_income[which(  eusilc$eqIncome > q[7] )] <- "(90,100]"

# variable of interests
Y <- data.frame(ecostat = eusilc$pl030)
Z <- data.frame(c_income = c_income)

# put same rownames
rownames(Xm) <- rownames(Y) <- rownames(Z)<- id

YZ <- table(cbind(Y,Z))
addmargins(YZ)
#>        c_income
#> ecostat (0,15] (15,30] (30,45] (45,60] (60,75] (75,90] (90,100]   Sum
#>     1      409     616     722     807     935    1025      648  5162
#>     2      189     181     205     184     165     154       82  1160
#>     3      137      90      72      75      59      52       33   518
#>     4      210     159     103      95      74      49       46   736
#>     5      470     462     492     477     459     435      351  3146
#>     6       57      25      28      30      17      11       10   178
#>     7      344     283     194     149     106      91       40  1207
#>     Sum   1816    1816    1816    1817    1815    1817     1210 12107

Sampling schemes

Here we set up the sampling designs and define all the quantities we will need for the rest of the vignette. The sample are selected with simple random sampling without replacement and the weights are equal to the inverse of the inclusion probabilities.


# size of sample
n1 <- 1000
n2 <- 500

# samples
s1 <- srswor(n1,N)
s2 <- srswor(n2,N)
  
# extract matching units
X1 <- Xm[s1 == 1,]
X2 <- Xm[s2 == 1,]
  
# extract variable of interest
Y1 <- data.frame(Y[s1 == 1,])
colnames(Y1) <- colnames(Y)
Z2 <- as.data.frame(Z[s2 == 1,])
colnames(Z2) <- colnames(Z)
  
# extract correct identities
id1 <- id[s1 == 1]
id2 <- id[s2 == 1]
  
# put correct rownames
rownames(Y1) <- id1
rownames(Z2) <- id2
  
# here weights are inverse of inclusion probabilities
d1 <- rep(N/n1,n1)
d2 <- rep(N/n2,n2)
  
# disjunctive form
Y_dis <- sampling::disjunctive(as.matrix(Y))
Z_dis <- sampling::disjunctive(as.matrix(Z))
  
Y1_dis <- Y_dis[s1 ==1,]
Z2_dis <- Z_dis[s2 ==1,]

Harmonization

Then the harmonization step must be performed. The harmonize function returns the harmonized weights. If by chance the true population totals are known, it is possible to use these instead of the estimate made within the function.



re <- harmonize(X1,d1,id1,X2,d2,id2)  

# if we want to use the population totals to harmonize we can use 
re <- harmonize(X1,d1,id1,X2,d2,id2,totals = c(N,colSums(Xm)))

w1 <- re$w1
w2 <- re$w2

colSums(Xm)
#>      1      2      3      4      5      6      7      8      9     10     11 
#>    476    887   2340    763   1880   1021   2244   1938    558   6263   5844 
#>     12     13     14  hsize    age 
#>  11073    283    751  36380 559915
colSums(w1*X1)
#>      1      2      3      4      5      6      7      8      9     10     11 
#>    476    887   2340    763   1880   1021   2244   1938    558   6263   5844 
#>     12     13     14  hsize    age 
#>  11073    283    751  36380 559915
colSums(w2*X2)
#>      1      2      3      4      5      6      7      8      9     10     11 
#>    476    887   2340    763   1880   1021   2244   1938    558   6263   5844 
#>     12     13     14  hsize    age 
#>  11073    283    751  36380 559915

Optimal transport matching

The statistical matching is done by using the otmatch function. The estimation of the contingency table is calculated by extracting the id1 units (respectively id2 units) and by using the function tapply with the correct weights.


# Optimal transport matching
object <- otmatch(X1,id1,X2,id2,w1,w2)
head(object[,1:3])
#>        id1    id2     weight
#> 403    403  68202  0.3533823
#> 403.1  403 291303  0.4018261
#> 403.2  403 346804  8.8281150
#> 1102  1102   8202 16.9882065
#> 3601  3601  98702  9.6752551
#> 4401  4401 505402 12.9211189

Y1_ot <- cbind(X1[as.character(object$id1),],y = Y1[as.character(object$id1),])
Z2_ot <- cbind(X2[as.character(object$id2),],z = Z2[as.character(object$id2),])
YZ_ot <- tapply(object$weight,list(Y1_ot$y,Z2_ot$z),sum)

# transform NA into 0
YZ_ot[is.na(YZ_ot)] <- 0

# result
round(addmargins(YZ_ot),3)
#>       (0,15]  (15,30]  (30,45]  (45,60]  (60,75]  (75,90] (90,100]       Sum
#> 1    629.331  669.682  850.572  773.026  970.846  898.865  506.896  5299.217
#> 2     93.760  166.364  158.383  186.463  215.472  101.371  127.179  1048.991
#> 3     65.847   61.611   77.755  124.347  114.869   84.566   64.394   593.389
#> 4     87.433  148.091  168.803  153.275  109.343   63.747   64.877   795.568
#> 5    408.113  331.255  403.197  591.916  320.182  453.015  358.132  2865.809
#> 6     14.670   57.724   43.408   14.515   32.984   25.597   19.977   208.875
#> 7    197.106  163.012  196.737  190.799  233.535   88.251  225.710  1295.151
#> Sum 1496.261 1597.738 1898.854 2034.341 1997.230 1715.411 1367.165 12107.000

Balanced sampling

As you can see from the previous section, the optimal transport results generally do not have a one-to-one match, meaning that for every unit in sample 1, we have more than one unit with weights not equal to 0 in sample 2. The bsmatch function creates a one-to-one match by selecting a balanced stratified sampling to obtain a data.frame where each unit in sample 1 has only one imputed unit from sample 2.


# Balanced Sampling 
BS <- bsmatch(object)
head(BS$object[,1:3])
#>         id1    id2    weight
#> 403.2   403 346804  8.828115
#> 1102   1102   8202 16.988207
#> 3601   3601  98702  9.675255
#> 4401   4401 505402 12.921119
#> 5802   5802 225103  6.295120
#> 5807.1 5807 535104  6.205533


Y1_bs <- cbind(X1[as.character(BS$object$id1),],y = Y1[as.character(BS$object$id1),])
Z2_bs <- cbind(X2[as.character(BS$object$id2),],z = Z2[as.character(BS$object$id2),])
YZ_bs <- tapply(BS$object$weight/BS$q,list(Y1_bs$y,Z2_bs$z),sum)
YZ_bs[is.na(YZ_bs)] <- 0
round(addmargins(YZ_bs),3)
#>       (0,15]  (15,30]  (30,45]  (45,60]  (60,75]  (75,90] (90,100]       Sum
#> 1    642.187  637.893  794.450  730.387  970.523  971.928  551.850  5299.217
#> 2    109.726  144.156  158.771  144.788  236.150  117.345  138.055  1048.991
#> 3     70.109   58.190   76.827  123.559  101.740   97.766   65.199   593.389
#> 4     84.974  171.265  178.762  141.893   87.308   63.371   67.996   795.568
#> 5    435.289  328.555  374.062  591.940  337.296  409.645  389.023  2865.809
#> 6     19.510   54.725   43.408    0.000   29.499   36.929   24.804   208.875
#> 7    202.206  164.054  170.426  213.932  259.763   66.919  217.852  1295.151
#> Sum 1563.999 1558.838 1796.705 1946.500 2022.278 1763.902 1454.778 12107.000

# With Z2 as auxiliary information for stratified balanced sampling.
BS <- bsmatch(object,Z2)

Y1_bs <- cbind(X1[as.character(BS$object$id1),],y = Y1[as.character(BS$object$id1),])
Z2_bs <- cbind(X2[as.character(BS$object$id2),],z = Z2[as.character(BS$object$id2),])
YZ_bs <- tapply(BS$object$weight/BS$q,list(Y1_bs$y,Z2_bs$z),sum)
YZ_bs[is.na(YZ_bs)] <- 0
round(addmargins(YZ_bs),3)
#>       (0,15]  (15,30]  (30,45]  (45,60]  (60,75]  (75,90] (90,100]       Sum
#> 1    664.176  624.817  884.100  753.818  961.680  905.805  504.822  5299.217
#> 2     88.449  161.440  135.434  208.980  198.057  144.442  112.189  1048.991
#> 3     59.268   78.260   86.470  119.127  101.381   83.683   65.199   593.389
#> 4     72.866  137.916  164.700  141.585  122.591   75.807   80.104   795.568
#> 5    401.952  393.652  374.130  602.795  347.135  402.837  343.309  2865.809
#> 6     19.510   43.550   43.408   21.966   29.499   26.138   24.804   208.875
#> 7    199.769  166.913  211.709  176.959  240.543   81.405  217.852  1295.151
#> Sum 1505.990 1606.548 1899.951 2025.229 2000.887 1720.117 1348.279 12107.000

Prediction


# split the weight by id1
q_l <- split(object$weight,f = object$id1)
# normalize in each id1
q_l <- lapply(q_l, function(x){x/sum(x)})
q <- as.numeric(do.call(c,q_l))
  
Z_pred <- t(q*disjunctive(object$id1))%*%disjunctive(Z2[as.character(object$id2),])
colnames(Z_pred) <- levels(factor(Z2$c_income))
head(Z_pred)
#>         (0,15]    (15,30] (30,45] (45,60]   (60,75] (75,90] (90,100]
#> [1,] 0.0000000 0.07880443       0       0 0.9211956       0        0
#> [2,] 1.0000000 0.00000000       0       0 0.0000000       0        0
#> [3,] 0.0000000 1.00000000       0       0 0.0000000       0        0
#> [4,] 0.0000000 0.00000000       1       0 0.0000000       0        0
#> [5,] 0.0000000 1.00000000       0       0 0.0000000       0        0
#> [6,] 0.7578724 0.24212757       0       0 0.0000000       0        0