Is your nationality a secret? Evidence from 2016 ESS data and regularization methods
Authors: Zhaokun Zhang, Zihan Zhang **For the challenge problem**
Abstract
In machine learning, the Lasso, ridge regression, and the elastic net are among the most common and widely used regularization methods. In this article, we apply these methods in an interesting experiment: predicting nationality from respondents' personal characteristics and cognition. Through the analysis we hope to gain a better understanding of regularization and model selection, and to identify factors that genuinely distinguish citizens of different countries.
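All three methods are instances of the penalized likelihood that `glmnet` fits. As a quick reminder (this is `glmnet`'s standard parameterization, not a result of this analysis), the objective is

$$\min_{\beta}\; -\frac{1}{N}\,\ell(\beta) \;+\; \lambda\left[\frac{1-\alpha}{2}\lVert\beta\rVert_2^2 \;+\; \alpha\lVert\beta\rVert_1\right],$$

where $\ell(\beta)$ is the multinomial log-likelihood: $\alpha = 1$ gives the Lasso, $\alpha = 0$ gives ridge, and $0 < \alpha < 1$ gives the elastic net.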
Full version
The note is written in R Markdown, with a PDF version available.
Data and Code
Data and code are available in this folder.
You can also view the code here.
#####################################
########### Nationality #############
#####################################
# install.packages(c("AER", "mlogit", "glmnet", "nnet"))
rm(list = ls())
library(AER)     # applied econometrics data and tools
library(mlogit)  # multinomial logit utilities
library(glmnet)  # Lasso / ridge / elastic net
library(nnet)    # multinom() for the unpenalized multinomial logit
####### Import Data ############
ESS8_c5 <- read.csv("ESS8_c5.csv", header = TRUE)
ESS8_c5 <- ESS8_c5[, -1]              # drop the first (row-index) column
ess <- na.omit(ESS8_c5)               # listwise deletion of missing values
n <- nrow(ess)
set.seed(1)                           # make the train/test split reproducible
train <- sample(n, floor(0.8 * n))
ess.tr <- ess[train, ]                # training set (80%)
ess.test <- ess[-train, ]             # test set (20%)
x <- data.frame(ess.tr[, 3:148])      # predictor columns
y <- ess.tr$cntry                     # response: country
x1 <- as.matrix(x)
mystats <- function(x, na.omit = FALSE) {
  if (na.omit)
    x <- x[!is.na(x)]
  m <- mean(x)
  n <- length(x)
  s <- sd(x)
  return(c(n = n, m = m, stdev = s))
}
dstats <- function(x) sapply(x, mystats)
myvars <- names(ess[3:148])
statable <- by(ess[myvars], ess$cntry, dstats)  # n, mean, sd of each predictor, by country
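# For example (the country code is hypothetical; list the available codes with
# unique(ess$cntry)):
# statable[["DE"]]   # per-variable n / mean / sd for one country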
######## Multinomial Logit Regression #######
mlfit <- multinom(cntry ~ ., data = data.frame(ess.tr[, 2:148]))  # baseline: unpenalized multinomial logit
mlfit.yhat <- predict(mlfit, ess.test)
t1 <- table(mlfit.yhat, ess.test$cntry, dnn = c("predicted", "true"))
t1
pred_error_multi <- 1 - sum(diag(t1)) / sum(t1)  # misclassification rate (1 - accuracy)
pred_error_multi
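# With 146 predictors and one coefficient vector per country class, the
# unpenalized multinomial logit estimates a very large number of parameters and
# can overfit; the penalized fits below shrink coefficients toward zero, and
# the Lasso sets some exactly to zero (variable selection).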
########## Cross-Validated Lasso ###########
lassofit <- glmnet(x1, y, family = "multinomial", standardize = TRUE, nlambda = 50, alpha = 1)  # Lasso path over 50 lambda values
# coef(lassofit)
plot(lassofit, xvar = "lambda", label = FALSE, main = "Number of variables")  # coefficient paths; the top axis counts nonzero variables
cvlassofit <- cv.glmnet(x1, y, family = "multinomial", standardize = TRUE, alpha = 1)  # 10-fold CV by default
coef(cvlassofit)  # coefficients at lambda.1se, coef()'s default
plot(cvlassofit)  # CV deviance against log(lambda)
optimal_lambda <- cvlassofit$lambda.min  # lambda minimizing CV error
optimal_lambda
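# Note: cvlassofit$lambda.1se, the largest lambda whose CV error is within one
# standard error of the minimum, is a common sparser alternative to lambda.min.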
# Refit the Lasso at the optimal lambda
lassofit <- glmnet(x1, y, family = "multinomial", standardize = TRUE, lambda = optimal_lambda, alpha = 1)
coef(lassofit)
lassofit.yhat <- predict(lassofit, newx = as.matrix(ess.test[, 3:148]), s = optimal_lambda, type = "class")
t2 <- table(lassofit.yhat, ess.test$cntry, dnn = c("predicted", "true"))
t2
pred_error_lasso <- 1 - sum(diag(t2)) / sum(t2)  # misclassification rate (1 - accuracy)
pred_error_lasso
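# A sketch (not part of the original script) of how to list the variables the
# Lasso keeps per country: for a multinomial fit, coef() returns one sparse
# coefficient matrix per class, and the nonzero rows are the selected predictors.
sel <- coef(cvlassofit, s = "lambda.min")
lapply(sel, function(b) {
  b <- as.matrix(b)                                   # one column of coefficients
  setdiff(rownames(b)[b[, 1] != 0], "(Intercept)")    # names of selected variables
})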
######## Cross-Validated Ridge ###########
ridgefit <- cv.glmnet(x1, y, family = "multinomial", standardize = TRUE, alpha = 0)  # alpha = 0 gives ridge
optimal_lambda_ridge <- ridgefit$lambda.min
optimal_lambda_ridge
plot(ridgefit)
ridgefit.yhat <- predict(ridgefit, newx = as.matrix(ess.test[, 3:148]), s = optimal_lambda_ridge, type = "class")
t3 <- table(ridgefit.yhat, ess.test$cntry, dnn = c("predicted", "true"))
t3
pred_error_ridge <- 1 - sum(diag(t3)) / sum(t3)  # misclassification rate (1 - accuracy)
pred_error_ridge
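# Unlike the Lasso, ridge shrinks coefficients but never sets them exactly to
# zero, so all 146 predictors stay in the model; it tends to do well when many
# variables each carry a little signal.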
######## Cross-Validated Elastic Net #######
enfit <- glmnet(x1, y, family = "multinomial", standardize = TRUE, lambda = 0.2, alpha = 0.5)  # example elastic net fit at a fixed lambda (not used below)
en_05 <- cv.glmnet(x1, y, family = "multinomial", standardize = TRUE, alpha = 0.5)  # alpha = 0.5: equal L1/L2 mix
optimal_lambda_en <- en_05$lambda.min
optimal_lambda_en
plot(en_05)
enfit.yhat <- predict(en_05, newx = as.matrix(ess.test[, 3:148]), s = optimal_lambda_en, type = "class")
t4 <- table(enfit.yhat, ess.test$cntry, dnn = c("predicted", "true"))
t4
pred_error_en <- 1 - sum(diag(t4)) / sum(t4)  # misclassification rate (1 - accuracy)
pred_error_en
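# A final comparison (a sketch, not in the original script): collect the
# test-set misclassification rates of the four models in one table.
data.frame(model = c("multinomial logit", "Lasso", "ridge", "elastic net (alpha = 0.5)"),
           test_error = c(pred_error_multi, pred_error_lasso, pred_error_ridge, pred_error_en))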