library(glmnet)
library(caret)
For this exercise we use the Communities and Crime data from the UCI ML repository, which includes information about communities in the US. “The data combines socio-economic data from the 1990 US Census, law enforcement data from the 1990 US LEMAS survey, and crime data from the 1995 FBI UCR”
Source: https://archive.ics.uci.edu/ml/datasets/Communities+and+Crime
First, some data prep.
crime <- read.csv("/Users/zhoudongqiang/Downloads/communities.data", header = FALSE, na.strings = "?")
varnames <- read.delim("/Users/zhoudongqiang/Downloads/communities.txt", header = FALSE)
Clean name vector and use as variable names.
varnames <- as.character(varnames$V1)
varnames <- gsub("@attribute ", "", varnames)
varnames <- gsub(" numeric", "", varnames)
varnames <- gsub(" string", "", varnames)
names(crime) <- varnames
To make things easier, drop columns with missing values.
crime <- crime[, colSums(is.na(crime)) == 0]
Check whats left.
str(crime)
## 'data.frame': 1994 obs. of 103 variables:
## $ state : int 8 53 24 34 42 6 44 6 21 29 ...
## $ communityname : chr "Lakewoodcity" "Tukwilacity" "Aberdeentown" "Willingborotownship" ...
## $ fold : int 1 1 1 1 1 1 1 1 1 1 ...
## $ population : num 0.19 0 0 0.04 0.01 0.02 0.01 0.01 0.03 0.01 ...
## $ householdsize : num 0.33 0.16 0.42 0.77 0.55 0.28 0.39 0.74 0.34 0.4 ...
## $ racepctblack : num 0.02 0.12 0.49 1 0.02 0.06 0 0.03 0.2 0.06 ...
## $ racePctWhite : num 0.9 0.74 0.56 0.08 0.95 0.54 0.98 0.46 0.84 0.87 ...
## $ racePctAsian : num 0.12 0.45 0.17 0.12 0.09 1 0.06 0.2 0.02 0.3 ...
## $ racePctHisp : num 0.17 0.07 0.04 0.1 0.05 0.25 0.02 1 0 0.03 ...
## $ agePct12t21 : num 0.34 0.26 0.39 0.51 0.38 0.31 0.3 0.52 0.38 0.9 ...
## $ agePct12t29 : num 0.47 0.59 0.47 0.5 0.38 0.48 0.37 0.55 0.45 0.82 ...
## $ agePct16t24 : num 0.29 0.35 0.28 0.34 0.23 0.27 0.23 0.36 0.28 0.8 ...
## $ agePct65up : num 0.32 0.27 0.32 0.21 0.36 0.37 0.6 0.35 0.48 0.39 ...
## $ numbUrban : num 0.2 0.02 0 0.06 0.02 0.04 0.02 0 0.04 0.02 ...
## $ pctUrban : num 1 1 0 1 0.9 1 0.81 0 1 1 ...
## $ medIncome : num 0.37 0.31 0.3 0.58 0.5 0.52 0.42 0.16 0.17 0.54 ...
## $ pctWWage : num 0.72 0.72 0.58 0.89 0.72 0.68 0.5 0.44 0.47 0.59 ...
## $ pctWFarmSelf : num 0.34 0.11 0.19 0.21 0.16 0.2 0.23 1 0.36 0.22 ...
## $ pctWInvInc : num 0.6 0.45 0.39 0.43 0.68 0.61 0.68 0.23 0.34 0.86 ...
## $ pctWSocSec : num 0.29 0.25 0.38 0.36 0.44 0.28 0.61 0.53 0.55 0.42 ...
## $ pctWPubAsst : num 0.15 0.29 0.4 0.2 0.11 0.15 0.21 0.97 0.48 0.02 ...
## $ pctWRetire : num 0.43 0.39 0.84 0.82 0.71 0.25 0.54 0.41 0.43 0.31 ...
## $ medFamInc : num 0.39 0.29 0.28 0.51 0.46 0.62 0.43 0.15 0.21 0.85 ...
## $ perCapInc : num 0.4 0.37 0.27 0.36 0.43 0.72 0.47 0.1 0.23 0.89 ...
## $ whitePerCap : num 0.39 0.38 0.29 0.4 0.41 0.76 0.44 0.12 0.23 0.94 ...
## $ blackPerCap : num 0.32 0.33 0.27 0.39 0.28 0.77 0.4 0.08 0.19 0.11 ...
## $ indianPerCap : num 0.27 0.16 0.07 0.16 0 0.28 0.24 0.17 0.1 0.09 ...
## $ AsianPerCap : num 0.27 0.3 0.29 0.25 0.74 0.52 0.86 0.27 0.26 0.33 ...
## $ HispPerCap : num 0.41 0.35 0.39 0.44 0.48 0.6 0.36 0.21 0.22 0.8 ...
## $ NumUnderPov : num 0.08 0.01 0.01 0.01 0 0.01 0.01 0.03 0.04 0 ...
## $ PctPopUnderPov : num 0.19 0.24 0.27 0.1 0.06 0.12 0.11 0.64 0.45 0.11 ...
## $ PctLess9thGrade : num 0.1 0.14 0.27 0.09 0.25 0.13 0.29 0.96 0.52 0.04 ...
## $ PctNotHSGrad : num 0.18 0.24 0.43 0.25 0.3 0.12 0.41 0.82 0.59 0.03 ...
## $ PctBSorMore : num 0.48 0.3 0.19 0.31 0.33 0.8 0.36 0.12 0.17 1 ...
## $ PctUnemployed : num 0.27 0.27 0.36 0.33 0.12 0.1 0.28 1 0.55 0.11 ...
## $ PctEmploy : num 0.68 0.73 0.58 0.71 0.65 0.65 0.54 0.26 0.43 0.44 ...
## $ PctEmplManu : num 0.23 0.57 0.32 0.36 0.67 0.19 0.44 0.43 0.59 0.2 ...
## $ PctEmplProfServ : num 0.41 0.15 0.29 0.45 0.38 0.77 0.53 0.34 0.36 1 ...
## $ PctOccupManu : num 0.25 0.42 0.49 0.37 0.42 0.06 0.33 0.71 0.64 0.02 ...
## $ PctOccupMgmtProf : num 0.52 0.36 0.32 0.39 0.46 0.91 0.49 0.18 0.29 0.96 ...
## $ MalePctDivorce : num 0.68 1 0.63 0.34 0.22 0.49 0.25 0.38 0.62 0.3 ...
## $ MalePctNevMarr : num 0.4 0.63 0.41 0.45 0.27 0.57 0.34 0.47 0.26 0.85 ...
## $ FemalePctDiv : num 0.75 0.91 0.71 0.49 0.2 0.61 0.28 0.59 0.66 0.39 ...
## $ TotalPctDiv : num 0.75 1 0.7 0.44 0.21 0.58 0.28 0.52 0.67 0.36 ...
## $ PersPerFam : num 0.35 0.29 0.45 0.75 0.51 0.44 0.42 0.78 0.37 0.31 ...
## $ PctFam2Par : num 0.55 0.43 0.42 0.65 0.91 0.62 0.77 0.45 0.51 0.65 ...
## $ PctKids2Par : num 0.59 0.47 0.44 0.54 0.91 0.69 0.81 0.43 0.55 0.73 ...
## $ PctYoungKids2Par : num 0.61 0.6 0.43 0.83 0.89 0.87 0.79 0.34 0.58 0.78 ...
## $ PctTeen2Par : num 0.56 0.39 0.43 0.65 0.85 0.53 0.74 0.34 0.47 0.67 ...
## $ PctWorkMomYoungKids : num 0.74 0.46 0.71 0.85 0.4 0.3 0.57 0.29 0.65 0.72 ...
## $ PctWorkMom : num 0.76 0.53 0.67 0.86 0.6 0.43 0.62 0.27 0.64 0.71 ...
## $ NumIlleg : num 0.04 0 0.01 0.03 0 0 0 0.02 0.02 0 ...
## $ PctIlleg : num 0.14 0.24 0.46 0.33 0.06 0.11 0.13 0.5 0.29 0.07 ...
## $ NumImmig : num 0.03 0.01 0 0.02 0 0.04 0.01 0.02 0 0.01 ...
## $ PctImmigRecent : num 0.24 0.52 0.07 0.11 0.03 0.3 0 0.5 0.12 0.41 ...
## $ PctImmigRec5 : num 0.27 0.62 0.06 0.2 0.07 0.35 0.02 0.59 0.09 0.44 ...
## $ PctImmigRec8 : num 0.37 0.64 0.15 0.3 0.2 0.43 0.02 0.65 0.07 0.52 ...
## $ PctImmigRec10 : num 0.39 0.63 0.19 0.31 0.27 0.47 0.1 0.59 0.13 0.48 ...
## $ PctRecentImmig : num 0.07 0.25 0.02 0.05 0.01 0.5 0 0.69 0 0.22 ...
## $ PctRecImmig5 : num 0.07 0.27 0.02 0.08 0.02 0.5 0.01 0.72 0 0.21 ...
## $ PctRecImmig8 : num 0.08 0.25 0.04 0.11 0.04 0.56 0.01 0.71 0 0.22 ...
## $ PctRecImmig10 : num 0.08 0.23 0.05 0.11 0.05 0.57 0.03 0.6 0 0.19 ...
## $ PctSpeakEnglOnly : num 0.89 0.84 0.88 0.81 0.88 0.45 0.73 0.12 0.99 0.85 ...
## $ PctNotSpeakEnglWell : num 0.06 0.1 0.04 0.08 0.05 0.28 0.05 0.93 0.01 0.03 ...
## $ PctLargHouseFam : num 0.14 0.16 0.2 0.56 0.16 0.25 0.12 0.74 0.12 0.09 ...
## $ PctLargHouseOccup : num 0.13 0.1 0.2 0.62 0.19 0.19 0.13 0.75 0.12 0.06 ...
## $ PersPerOccupHous : num 0.33 0.17 0.46 0.85 0.59 0.29 0.42 0.8 0.35 0.15 ...
## $ PersPerOwnOccHous : num 0.39 0.29 0.52 0.77 0.6 0.53 0.54 0.68 0.38 0.34 ...
## $ PersPerRentOccHous : num 0.28 0.17 0.43 1 0.37 0.18 0.24 0.92 0.33 0.05 ...
## $ PctPersOwnOccup : num 0.55 0.26 0.42 0.94 0.89 0.39 0.65 0.39 0.5 0.48 ...
## $ PctPersDenseHous : num 0.09 0.2 0.15 0.12 0.02 0.26 0.03 0.89 0.1 0.03 ...
## $ PctHousLess3BR : num 0.51 0.82 0.51 0.01 0.19 0.73 0.46 0.66 0.64 0.58 ...
## $ MedNumBR : num 0.5 0 0.5 0.5 0.5 0 0.5 0 0 0 ...
## $ HousVacant : num 0.21 0.02 0.01 0.01 0.01 0.02 0.01 0.01 0.04 0.02 ...
## $ PctHousOccup : num 0.71 0.79 0.86 0.97 0.89 0.84 0.89 0.91 0.72 0.72 ...
## $ PctHousOwnOcc : num 0.52 0.24 0.41 0.96 0.87 0.3 0.57 0.46 0.49 0.38 ...
## $ PctVacantBoarded : num 0.05 0.02 0.29 0.6 0.04 0.16 0.09 0.22 0.05 0.07 ...
## $ PctVacMore6Mos : num 0.26 0.25 0.3 0.47 0.55 0.28 0.49 0.37 0.49 0.47 ...
## $ MedYrHousBuilt : num 0.65 0.65 0.52 0.52 0.73 0.25 0.38 0.6 0.5 0.04 ...
## $ PctHousNoPhone : num 0.14 0.16 0.47 0.11 0.05 0.02 0.05 0.28 0.57 0.01 ...
## $ PctWOFullPlumb : num 0.06 0 0.45 0.11 0.14 0.05 0.05 0.23 0.22 0 ...
## $ OwnOccLowQuart : num 0.22 0.21 0.18 0.24 0.31 0.94 0.37 0.15 0.07 0.63 ...
## $ OwnOccMedVal : num 0.19 0.2 0.17 0.21 0.31 1 0.38 0.13 0.07 0.71 ...
## $ OwnOccHiQuart : num 0.18 0.21 0.16 0.19 0.3 1 0.39 0.13 0.08 0.79 ...
## $ RentLowQ : num 0.36 0.42 0.27 0.75 0.4 0.67 0.26 0.21 0.14 0.44 ...
## $ RentMedian : num 0.35 0.38 0.29 0.7 0.36 0.63 0.35 0.24 0.17 0.42 ...
## $ RentHighQ : num 0.38 0.4 0.27 0.77 0.38 0.68 0.42 0.25 0.16 0.47 ...
## $ MedRent : num 0.34 0.37 0.31 0.89 0.38 0.62 0.35 0.24 0.15 0.41 ...
## $ MedRentPctHousInc : num 0.38 0.29 0.48 0.63 0.22 0.47 0.46 0.64 0.38 0.23 ...
## $ MedOwnCostPctInc : num 0.46 0.32 0.39 0.51 0.51 0.59 0.44 0.59 0.13 0.27 ...
## $ MedOwnCostPctIncNoMtg: num 0.25 0.18 0.28 0.47 0.21 0.11 0.31 0.28 0.36 0.28 ...
## $ NumInShelters : num 0.04 0 0 0 0 0 0 0 0.01 0 ...
## $ NumStreet : num 0 0 0 0 0 0 0 0 0 0 ...
## $ PctForeignBorn : num 0.12 0.21 0.14 0.19 0.11 0.7 0.15 0.59 0.01 0.22 ...
## $ PctBornSameState : num 0.42 0.5 0.49 0.3 0.72 0.42 0.81 0.58 0.78 0.42 ...
## $ PctSameHouse85 : num 0.5 0.34 0.54 0.73 0.64 0.49 0.77 0.52 0.48 0.34 ...
## $ PctSameCity85 : num 0.51 0.6 0.67 0.64 0.61 0.73 0.91 0.79 0.79 0.23 ...
## $ PctSameState85 : num 0.64 0.52 0.56 0.65 0.53 0.64 0.84 0.78 0.75 0.09 ...
## $ LandArea : num 0.12 0.02 0.01 0.02 0.04 0.01 0.05 0.01 0.04 0 ...
## [list output truncated]
Next, we want to split the data into a training (75%) and a test
(25%) part. This can be done by random sampling with
sample. Note that there is a fold variable in
the data set, but here we want to follow our own train/test
procedure.
crime
varnames
## [1] "state" "county" "community"
## [4] "communityname" "fold" "population"
## [7] "householdsize" "racepctblack" "racePctWhite"
## [10] "racePctAsian" "racePctHisp" "agePct12t21"
## [13] "agePct12t29" "agePct16t24" "agePct65up"
## [16] "numbUrban" "pctUrban" "medIncome"
## [19] "pctWWage" "pctWFarmSelf" "pctWInvInc"
## [22] "pctWSocSec" "pctWPubAsst" "pctWRetire"
## [25] "medFamInc" "perCapInc" "whitePerCap"
## [28] "blackPerCap" "indianPerCap" "AsianPerCap"
## [31] "OtherPerCap" "HispPerCap" "NumUnderPov"
## [34] "PctPopUnderPov" "PctLess9thGrade" "PctNotHSGrad"
## [37] "PctBSorMore" "PctUnemployed" "PctEmploy"
## [40] "PctEmplManu" "PctEmplProfServ" "PctOccupManu"
## [43] "PctOccupMgmtProf" "MalePctDivorce" "MalePctNevMarr"
## [46] "FemalePctDiv" "TotalPctDiv" "PersPerFam"
## [49] "PctFam2Par" "PctKids2Par" "PctYoungKids2Par"
## [52] "PctTeen2Par" "PctWorkMomYoungKids" "PctWorkMom"
## [55] "NumIlleg" "PctIlleg" "NumImmig"
## [58] "PctImmigRecent" "PctImmigRec5" "PctImmigRec8"
## [61] "PctImmigRec10" "PctRecentImmig" "PctRecImmig5"
## [64] "PctRecImmig8" "PctRecImmig10" "PctSpeakEnglOnly"
## [67] "PctNotSpeakEnglWell" "PctLargHouseFam" "PctLargHouseOccup"
## [70] "PersPerOccupHous" "PersPerOwnOccHous" "PersPerRentOccHous"
## [73] "PctPersOwnOccup" "PctPersDenseHous" "PctHousLess3BR"
## [76] "MedNumBR" "HousVacant" "PctHousOccup"
## [79] "PctHousOwnOcc" "PctVacantBoarded" "PctVacMore6Mos"
## [82] "MedYrHousBuilt" "PctHousNoPhone" "PctWOFullPlumb"
## [85] "OwnOccLowQuart" "OwnOccMedVal" "OwnOccHiQuart"
## [88] "RentLowQ" "RentMedian" "RentHighQ"
## [91] "MedRent" "MedRentPctHousInc" "MedOwnCostPctInc"
## [94] "MedOwnCostPctIncNoMtg" "NumInShelters" "NumStreet"
## [97] "PctForeignBorn" "PctBornSameState" "PctSameHouse85"
## [100] "PctSameCity85" "PctSameState85" "LemasSwornFT"
## [103] "LemasSwFTPerPop" "LemasSwFTFieldOps" "LemasSwFTFieldPerPop"
## [106] "LemasTotalReq" "LemasTotReqPerPop" "PolicReqPerOffic"
## [109] "PolicPerPop" "RacialMatchCommPol" "PctPolicWhite"
## [112] "PctPolicBlack" "PctPolicHisp" "PctPolicAsian"
## [115] "PctPolicMinor" "OfficAssgnDrugUnits" "NumKindsDrugsSeiz"
## [118] "PolicAveOTWorked" "LandArea" "PopDens"
## [121] "PctUsePubTrans" "PolicCars" "PolicOperBudg"
## [124] "LemasPctPolicOnPatr" "LemasGangUnitDeploy" "LemasPctOfficDrugUn"
## [127] "PolicBudgPerPop" "ViolentCrimesPerPop"
names(crime)
## [1] "state" "communityname" "fold"
## [4] "population" "householdsize" "racepctblack"
## [7] "racePctWhite" "racePctAsian" "racePctHisp"
## [10] "agePct12t21" "agePct12t29" "agePct16t24"
## [13] "agePct65up" "numbUrban" "pctUrban"
## [16] "medIncome" "pctWWage" "pctWFarmSelf"
## [19] "pctWInvInc" "pctWSocSec" "pctWPubAsst"
## [22] "pctWRetire" "medFamInc" "perCapInc"
## [25] "whitePerCap" "blackPerCap" "indianPerCap"
## [28] "AsianPerCap" "HispPerCap" "NumUnderPov"
## [31] "PctPopUnderPov" "PctLess9thGrade" "PctNotHSGrad"
## [34] "PctBSorMore" "PctUnemployed" "PctEmploy"
## [37] "PctEmplManu" "PctEmplProfServ" "PctOccupManu"
## [40] "PctOccupMgmtProf" "MalePctDivorce" "MalePctNevMarr"
## [43] "FemalePctDiv" "TotalPctDiv" "PersPerFam"
## [46] "PctFam2Par" "PctKids2Par" "PctYoungKids2Par"
## [49] "PctTeen2Par" "PctWorkMomYoungKids" "PctWorkMom"
## [52] "NumIlleg" "PctIlleg" "NumImmig"
## [55] "PctImmigRecent" "PctImmigRec5" "PctImmigRec8"
## [58] "PctImmigRec10" "PctRecentImmig" "PctRecImmig5"
## [61] "PctRecImmig8" "PctRecImmig10" "PctSpeakEnglOnly"
## [64] "PctNotSpeakEnglWell" "PctLargHouseFam" "PctLargHouseOccup"
## [67] "PersPerOccupHous" "PersPerOwnOccHous" "PersPerRentOccHous"
## [70] "PctPersOwnOccup" "PctPersDenseHous" "PctHousLess3BR"
## [73] "MedNumBR" "HousVacant" "PctHousOccup"
## [76] "PctHousOwnOcc" "PctVacantBoarded" "PctVacMore6Mos"
## [79] "MedYrHousBuilt" "PctHousNoPhone" "PctWOFullPlumb"
## [82] "OwnOccLowQuart" "OwnOccMedVal" "OwnOccHiQuart"
## [85] "RentLowQ" "RentMedian" "RentHighQ"
## [88] "MedRent" "MedRentPctHousInc" "MedOwnCostPctInc"
## [91] "MedOwnCostPctIncNoMtg" "NumInShelters" "NumStreet"
## [94] "PctForeignBorn" "PctBornSameState" "PctSameHouse85"
## [97] "PctSameCity85" "PctSameState85" "LandArea"
## [100] "PopDens" "PctUsePubTrans" "LemasPctOfficDrugUn"
## [103] "ViolentCrimesPerPop"
Next, we want to split the data into a training (75%) and a test
(25%) part. This can be done by random sampling with
sample. Note that there is a fold variable in
the data set, but here we want to follow our own train/test
procedure.
set.seed(3940)
train<- sample(1:nrow(crime),0.75*nrow(crime))
crime_train <- crime[train,]
crime_test <- crime[-train,]
Now, prepare the training data for running regularized regression
models via glmnet. Our prediction outcome is
ViolentCrimesPerPop. As X, use all variables except
state, communityname, and
fold.
X <- model.matrix(ViolentCrimesPerPop~ . -state -communityname -fold,crime_train)[,-1]
y <-crime_train$ViolentCrimesPerPop
Check whether X looks ok.
dim(X)
## [1] 1495 99
Estimate a sequence of Lasso models using glmnet. You
can stick with the defaults for choosing a range of lambdas.
m1 <- glmnet(X, y, alpha = 1)
summary(m1)
## Length Class Mode
## a0 84 -none- numeric
## beta 8316 dgCMatrix S4
## df 84 -none- numeric
## dim 2 -none- numeric
## lambda 84 -none- numeric
## dev.ratio 84 -none- numeric
## nulldev 1 -none- numeric
## npasses 1 -none- numeric
## jerr 1 -none- numeric
## offset 1 -none- logical
## call 4 -none- call
## nobs 1 -none- numeric
Here we want to display lambda and the coefficients of the first Lasso model.
m1$lambda[1]
## [1] 0.173677
Same for the last Lasso model.
m1$lambda[ncol(m1$beta)]
## [1] 7.694969e-05
m1$beta[, ncol(m1$beta)]
## population householdsize racepctblack
## 0.0000000000 -0.0467119436 0.1405420386
## racePctWhite racePctAsian racePctHisp
## -0.0784110189 0.0033650951 0.0377839699
## agePct12t21 agePct12t29 agePct16t24
## 0.0547096664 -0.2420116460 -0.0794665694
## agePct65up numbUrban pctUrban
## 0.0734592372 -0.1057359055 0.0462646090
## medIncome pctWWage pctWFarmSelf
## -0.1068622443 -0.2727411674 0.0361572030
## pctWInvInc pctWSocSec pctWPubAsst
## -0.1206884786 0.0000000000 -0.0457339641
## pctWRetire medFamInc perCapInc
## -0.0987426653 0.1883069483 0.0000000000
## whitePerCap blackPerCap indianPerCap
## -0.2237584263 -0.0085220184 -0.0269119338
## AsianPerCap HispPerCap NumUnderPov
## 0.0109334338 0.0715714495 -0.0387939791
## PctPopUnderPov PctLess9thGrade PctNotHSGrad
## -0.1885192255 -0.1390510982 0.1619277742
## PctBSorMore PctUnemployed PctEmploy
## 0.0841713604 0.0000000000 0.2321456493
## PctEmplManu PctEmplProfServ PctOccupManu
## -0.0747049577 -0.0158531222 0.0783355017
## PctOccupMgmtProf MalePctDivorce MalePctNevMarr
## 0.0835359850 0.3437189719 0.2186017045
## FemalePctDiv TotalPctDiv PersPerFam
## 0.0000000000 -0.3410677185 -0.0232508743
## PctFam2Par PctKids2Par PctYoungKids2Par
## -0.0304917818 -0.3636826150 0.0011372497
## PctTeen2Par PctWorkMomYoungKids PctWorkMom
## -0.0056646002 0.0456618960 -0.1759819996
## NumIlleg PctIlleg NumImmig
## -0.0354110249 0.1521929130 -0.1364410471
## PctImmigRecent PctImmigRec5 PctImmigRec8
## 0.0318542348 -0.0156197365 -0.0011773428
## PctImmigRec10 PctRecentImmig PctRecImmig5
## 0.0004034034 -0.0317792184 0.0000000000
## PctRecImmig8 PctRecImmig10 PctSpeakEnglOnly
## 0.0228000793 0.0000000000 -0.0194631398
## PctNotSpeakEnglWell PctLargHouseFam PctLargHouseOccup
## -0.1057176919 -0.1438440982 0.0000000000
## PersPerOccupHous PersPerOwnOccHous PersPerRentOccHous
## 0.6418167155 -0.1458678685 -0.1602083407
## PctPersOwnOccup PctPersDenseHous PctHousLess3BR
## -0.3014884104 0.1766640078 0.0889800426
## MedNumBR HousVacant PctHousOccup
## 0.0021261371 0.2634342568 -0.0462776596
## PctHousOwnOcc PctVacantBoarded PctVacMore6Mos
## 0.1578368435 0.0296965895 -0.0629292133
## MedYrHousBuilt PctHousNoPhone PctWOFullPlumb
## -0.0240986976 0.0248413661 -0.0153998620
## OwnOccLowQuart OwnOccMedVal OwnOccHiQuart
## -0.1145319372 0.0000000000 0.0710877960
## RentLowQ RentMedian RentHighQ
## -0.2191923978 0.0000000000 -0.0063219422
## MedRent MedRentPctHousInc MedOwnCostPctInc
## 0.2636057732 0.0485788475 -0.0604851199
## MedOwnCostPctIncNoMtg NumInShelters NumStreet
## -0.0824573364 0.0594862269 0.2214652393
## PctForeignBorn PctBornSameState PctSameHouse85
## 0.0747635213 0.0037892268 -0.0423618597
## PctSameCity85 PctSameState85 LandArea
## 0.0072765065 0.0455436207 -0.0176181733
## PopDens PctUsePubTrans LemasPctOfficDrugUn
## -0.0321320775 -0.0601713493 0.0376963355
Now, plot the coefficient paths.
plot(m1, label=T)
plot(m1, label=T, xvar = "lambda")
Next, we need to decide which Lasso model to pick for prediction. Use Cross-Validation for this purpose.
m1_cv <- cv.glmnet(X, y, alpha = 1)
m1_cv
##
## Call: cv.glmnet(x = X, y = y, alpha = 1)
##
## Measure: Mean-Squared Error
##
## Lambda Index Measure SE Nonzero
## min 0.000788 59 0.01866 0.001356 55
## 1se 0.015461 27 0.01995 0.001353 10
And plot the Cross-validation results.
plot(m1_cv)
In your own words, briefly describe the CV plot. (1) What is plotted here, (2) what can you infer about the relation between the number of variables and prediction accuracy?
Now, store the lambda value of the model with the smallest CV error
as bestlam1.
bestlam1 <- m1_cv$lambda.min
bestlam1
## [1] 0.000787604
Create bestlam2 as the lambda according to the
1-standard error rule.
bestlam2 <- m1_cv$lambda.1se
bestlam2
## [1] 0.01546099
Finally, we investigate the performance of our models in the test set. For this task, construct a X matrix from the test set.
Xt <- model.matrix(object = ViolentCrimesPerPop ~ . - state - communityname - fold,
data = crime_test)[, -1]
Use the predict function to generate predicted values
for both models (i.e., both lambdas stored earlier).
p_1 <- predict(m1, s = bestlam1, newx = Xt)
p_2 <- predict(m1, s = bestlam2, newx = Xt)
Compute the test MSE of our models.
mean((p_1 - crime_test$ViolentCrimesPerPop)^2)
## [1] 0.01856262
mean((p_2 - crime_test$ViolentCrimesPerPop)^2)
## [1] 0.01970093
In addition, use another performance metric and compute the corresponding values for both models.
postResample(pred = p_1, obs = crime_test$ViolentCrimesPerPop)
## RMSE Rsquared MAE
## 0.13624470 0.64810959 0.09419406
postResample(pred = p_2, obs = crime_test$ViolentCrimesPerPop)
## RMSE Rsquared MAE
## 0.14035998 0.62727376 0.09756406
Which model is better? Does it depend on the performance measure that is used?