--- title: Non-linear latent variable models and error-in-variable models author: Klaus Kähler Holst date: "`r Sys.Date()`" output: rmarkdown::html_vignette: fig_caption: yes fig_width: 6 fig_height: 4 vignette: > %\VignetteIndexEntry{Non-linear latent variable models and error-in-variable models} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} bibliography: ref.bib --- ```{r include=FALSE } knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) mets <- lava:::versioncheck('mets', 1) fullVignette <- Sys.getenv("_R_FULL_VIGNETTE_") %in% c("1","TRUE") ``` ```{r load, results="hide",message=FALSE,warning=FALSE } library('lava') ``` We consider the measurement models given by \[X_{j} = \eta_{1} + \epsilon_{j}^{x}, \quad j=1,2,3\] \[Y_{j} = \eta_{2} + \epsilon_{j}^{y}, \quad j=1,2,3\] and with a structural model given by \[\eta_{2} = f(\eta_{1}) + Z + \zeta_{2}\label{ex:eta2}\] \[\eta_{1} = Z + \zeta_{1}\label{ex:eta1}\] with iid measurement errors \(\epsilon_{j}^{x},\epsilon_{j}^{y},\zeta_{1},\zeta_{2}\sim\mathcal{N}(0,1), j=1,2,3.\) and standard normal distributed covariate \(Z\). To simulate from this model we use the following syntax: ```{r sim } f <- function(x) cos(1.25*x) + x - 0.25*x^2 m <- lvm(x1+x2+x3 ~ eta1, y1+y2+y3 ~ eta2, latent=~eta1+eta2) regression(m) <- eta1+eta2 ~ z functional(m, eta2~eta1) <- f d <- sim(m, n=200, seed=42) # Default is all parameters are 1 ``` ```{r } ## plot(m, plot.engine="visNetwork") plot(m) ``` We refer to [@holst_budtzjorgensen_2013] for details on the syntax for model specification. # Estimation To estimate the parameters using the two-stage estimator described in [@holst_budtzjorgensen_2020], the first step is now to specify the measurement models ```{r specifymodels } m1 <- lvm(x1+x2+x3 ~ eta1, eta1 ~ z, latent=~eta1) m2 <- lvm(y1+y2+y3 ~ eta2, eta2 ~ z, latent=~eta2) ``` Next, we specify a quadratic relationship between the two latent variables ```{r } nonlinear(m2, type="quadratic") <- eta2 ~ eta1 ``` and the model can then be estimated using the two-stage estimator ```{r twostage1 } e1 <- twostage(m1, m2, data=d) e1 ``` We see a clear statistically significant effect of the second order term (`eta2~eta1_2`). For comparison we can also estimate the full MLE of the linear model: ```{r linear_mle } e0 <- estimate(regression(m1%++%m2, eta2~eta1), d) estimate(e0,keep="^eta2~[a-z]",regex=TRUE) ## Extract coef. matching reg.ex. ``` Next, we calculate predictions from the quadratic model using the estimated parameter coefficients \[ \mathbb{E}_{\widehat{\theta}_{2}}(\eta_{2} \mid \eta_{1}, Z=0), \] ```{r pred1 } newd <- expand.grid(eta1=seq(-4, 4, by=0.1), z=0) pred1 <- predict(e1, newdata=newd, x=TRUE) head(pred1) ``` To obtain a potential better fit we next proceed with a natural cubic spline ```{r spline_twostage } kn <- seq(-3,3,length.out=5) nonlinear(m2, type="spline", knots=kn) <- eta2 ~ eta1 e2 <- twostage(m1, m2, data=d) e2 ``` Confidence limits can be obtained via the Delta method using the `estimate` method: ```{r spline_ci } p <- cbind(eta1=newd$eta1, estimate(e2,f=function(p) predict(e2,p=p,newdata=newd))$coefmat) head(p) ``` The fitted function can be obtained with the following code: ```{r figpred2 } plot(I(eta2-z) ~ eta1, data=d, col=Col("black",0.5), pch=16, xlab=expression(eta[1]), ylab=expression(eta[2]), xlim=c(-4,4)) lines(Estimate ~ eta1, data=as.data.frame(p), col="darkblue", lwd=5) confband(p[,1], lower=p[,4], upper=p[,5], polygon=TRUE, border=NA, col=Col("darkblue",0.2)) ``` # Cross-validation A more formal comparison of the different models can be obtained by cross-validation. Here we specify linear, quadratic and cubic spline models with 4 and 9 degrees of freedom. ```{r spline_several } m2a <- nonlinear(m2, type="linear", eta2~eta1) m2b <- nonlinear(m2, type="quadratic", eta2~eta1) kn1 <- seq(-3,3,length.out=5) kn2 <- seq(-3,3,length.out=8) m2c <- nonlinear(m2, type="spline", knots=kn1, eta2~eta1) m2d <- nonlinear(m2, type="spline", knots=kn2, eta2~eta1) ``` To assess the model fit average RMSE is estimated with 5-fold cross-validation repeated two times ```{r cv_fit, cache=TRUE, eval=fullVignette } ## Scale models in stage 2 to allow for a fair RMSE comparison d0 <- d for (i in endogenous(m2)) d0[,i] <- scale(d0[,i],center=TRUE,scale=TRUE) ## Repeated 5-fold cross-validation: ff <- lapply(list(linear=m2a,quadratic=m2b,spline4=m2c,spline6=m2d), function(m) function(data,...) twostage(m1,m,data=data,stderr=FALSE,control=list(start=coef(e0),contrain=TRUE))) fit.cv <- lava:::cv(ff,data=d,K=5,rep=2,mc.cores=parallel::detectCores(),seed=1) ``` ```{r results="hide", echo=FALSE } ## To save time building the vignettes on CRAN, we cache time consuming computations if (fullVignette) { fit.cv$fit <- NULL saveRDS(fit.cv, "data/nonlinear_fitcv.rds") } else { fit.cv <- readRDS("data/nonlinear_fitcv.rds") } ``` ```{r } summary(fit.cv) ``` Here the RMSE is in favour of the splines model with 4 degrees of freedom: ```{r multifit } fit <- lapply(list(m2a,m2b,m2c,m2d), function(x) { e <- twostage(m1,x,data=d) pr <- cbind(eta1=newd$eta1,predict(e,newdata=newd$eta1,x=TRUE)) return(list(estimate=e,predict=as.data.frame(pr))) }) plot(I(eta2-z) ~ eta1, data=d, col=Col("black",0.5), pch=16, xlab=expression(eta[1]), ylab=expression(eta[2]), xlim=c(-4,4)) col <- c("orange","darkred","darkgreen","darkblue") lty <- c(3,4,1,5) for (i in seq_along(fit)) { with(fit[[i]]$pr, lines(eta2 ~ eta1, col=col[i], lwd=4, lty=lty[i])) } legend("bottomright", c("linear","quadratic","spline(df=4)","spline(df=6)"), col=col, lty=lty, lwd=3) ``` For convenience, the function `twostageCV` can be used to do the cross-validation (also for choosing the mixture distribution via the \`\`nmix\`\` argument, see the section below). For example, ```{r twostageCV, cache=TRUE, eval=fullVignette } selmod <- twostageCV(m1, m2, data=d, df=2:4, nmix=1:2, nfolds=2, rep=1, mc.cores=parallel::detectCores()) ``` ```{r results="hide", echo=FALSE } ## To save time building the vignettes on CRAN, we cache time consuming computations if (fullVignette) { saveRDS(summary(selmod), "data/nonlinear_selmod.rds") } else { selmod <- readRDS("data/nonlinear_selmod.rds") } ``` applies cross-validation (here just 2 folds for simplicity) to select the best splines with degrees of freedom varying from from 1-3 (the linear model is automatically included) ```{r } selmod ``` # Specification of general functional forms Next, we show how to specify a general functional relation of multiple different latent or exogenous variables. This is achieved via the `predict.fun` argument. To illustrate this we include interactions between the latent variable \(\eta_{1}\) and a dichotomized version of the covariate \(z\) ```{r } d$g <- (d$z<0)*1 ## Group variable mm1 <- regression(m1, ~g) # Add grouping variable as exogenous variable (effect specified via 'predict.fun') mm2 <- regression(m2, eta2~ u1+u2+u1:g+u2:g+z) pred <- function(mu,var,data,...) { cbind("u1"=mu[,1],"u2"=mu[,1]^2+var[1], "u1:g"=mu[,1]*data[,"g"],"u2:g"=(mu[,1]^2+var[1])*data[,"g"]) } ee1 <- twostage(mm1, model2=mm2, data=d, predict.fun=pred) estimate(ee1,keep="eta2~u",regex=TRUE) ``` A formal test show no statistically significant effect of this interaction ```{r } summary(estimate(ee1,keep="(:g)", regex=TRUE)) ``` # Mixture models Lastly, we demonstrate how the distributional assumptions of stage 1 model can be relaxed by letting the conditional distribution of the latent variable given covariates follow a Gaussian mixture distribution. The following code explictly defines the parameter constraints of the model by setting the intercept of the first indicator variable, \(x_{1}\), to zero and the factor loading parameter of the same variable to one. ```{r } m1 <- baptize(m1) ## Label all parameters intercept(m1, ~x1+eta1) <- list(0,NA) ## Set intercept of x1 to zero. Remove the label of eta1 regression(m1,x1~eta1) <- 1 ## Factor loading fixed to 1 ``` The mixture model may then be estimated using the `mixture` method (note, this requires the `mets` package to be installed), where the Parameter names shared across the different mixture components given in the `list` will be constrained to be identical in the mixture model. Thus, only the intercept of \(\eta_{1}\) is allowed to vary between the mixtures. ```{r mixture1, cache=TRUE, eval=fullVignette } set.seed(1) em0 <- mixture(m1, k=2, data=d) ``` To decrease the risk of using a local maximizer of the likelihood we can rerun the estimation with different random starting values ```{r estmixture, cache=TRUE,warnings=FALSE,messages=FALSE,eval=FALSE } em0 <- NULL ll <- c() for (i in 1:5) { set.seed(i) em <- mixture(m1, k=2, data=d, control=list(trace=0)) ll <- c(ll,logLik(em)) if (is.null(em0) || logLik(em0)