# Part A: bring the example workspace into the session and list what it
# contains.
load(file = "/Users/alicia/Downloads/exampleData.rData")
ls()
## [1] "custdata" "hhdata" "medianincome"

# Column inventory of the customer table before any clean-up.
colnames(custdata)
## [1] "state.of.res" "custid" "sex" "is.employed"
## [5] "income" "marital.stat" "health.ins" "housing.type"
## [9] "recent.move" "num.vehicles" "age" "is.employed.fix1"
## [13] "age.normalized" "Median.Income" "income.norm" "gp"
## [17] "income.lt.30K" "age.range" "Income"

# Keep every column except the seven listed below. The logical mask is built
# from the column names, so the order of the remaining columns is unchanged.
custdata <- custdata[
  !(names(custdata) %in% c(
    "is.employed.fix1",
    "age.normalized",
    "Median.Income",
    "income.norm",
    "gp",
    "income.lt.30K",
    "age.range"
  ))
]

# Missing-value count per remaining column.
colSums(is.na(custdata))
## state.of.res custid sex is.employed income marital.stat
## 0 0 0 328 0 0
## health.ins housing.type recent.move num.vehicles age Income
## 0 56 56 56 0 328

# Listwise deletion: discard every row that has an NA in any column, then
# re-run the count to confirm nothing is left missing.
custdata <- na.omit(object = custdata)
colSums(is.na(custdata))
## state.of.res custid sex is.employed income marital.stat
## 0 0 0 0 0 0
## health.ins housing.type recent.move num.vehicles age Income
## 0 0 0 0 0 0
# A.d) Using the original data set (reloaded below), flag or impute the missing values instead of dropping rows:
# Reload the workspace so custdata is back in its original state.
load(file = "/Users/alicia/Downloads/exampleData.rData")

# is.employed: count the NAs, then build a categorical copy in which NA is its
# own "missing" level. Writing a string into the vector coerces the remaining
# values to text before the factor conversion.
sum(is.na(custdata$is.employed))
## [1] 328
custdata$is_employed_AddMissing <- as.factor(
  replace(custdata$is.employed, is.na(custdata$is.employed), "missing")
)

# Income, option 1: fill the NAs with 0. The printed summary shows the mean
# falling to 44486.
sum(is.na(custdata$Income))
## [1] 328
custdata$Income_MissingfixedByZeroes <- replace(
  custdata$Income,
  is.na(custdata$Income),
  0
)
summary(custdata$Income_MissingfixedByZeroes)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 25250 44486 60000 615000

# Income, option 2: fill the NAs with the mean of the observed values, which
# leaves the column mean at that same value.
mean_income <- mean(custdata$Income, na.rm = TRUE)
mean_income
## [1] 66198.67
custdata$Income_fixedByMean <- replace(
  custdata$Income,
  is.na(custdata$Income),
  mean_income
)
summary(custdata$Income_fixedByMean)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 35000 66199 66199 66199 615000
# Part B: Data Transformations ----
# Join the state-level table onto the customer table: custdata's state.of.res
# is matched against medianincome's State. The printed result shows
# Median.Income.x and Median.Income.y, i.e. both inputs carry a Median.Income
# column and merge() suffixes them.
custdata_merged <- merge(
  x = custdata,
  y = medianincome,
  by.x = "state.of.res",
  by.y = "State"
)

# Preview the first rows of the merged table.
head(custdata_merged)
## state.of.res custid sex is.employed income marital.stat health.ins
## 1 Alabama 1063014 F TRUE 82000 Married TRUE
## 2 Alabama 1192089 M NA 49000 Married TRUE
## 3 Alabama 16551 F NA 7000 Married TRUE
## 4 Alabama 1079878 F NA 37200 Divorced/Separated TRUE
## 5 Alabama 502705 M TRUE 70000 Married FALSE
## 6 Alabama 674271 M FALSE 0 Married TRUE
## housing.type recent.move num.vehicles age is.employed.fix1
## 1 Rented FALSE 2 43 employed
## 2 Homeowner free and clear FALSE 2 77 missing
## 3 Homeowner with mortgage/loan FALSE 2 46 missing
## 4 Homeowner with mortgage/loan FALSE 1 62 missing
## 5 Rented FALSE 4 37 employed
## 6 Rented TRUE 1 54 not employed
## age.normalized Median.Income.x income.norm gp income.lt.30K age.range
## 1 -0.4612000 52371 1.5657520 0.9350600 FALSE (25,65]
## 2 1.3412291 52371 0.9356323 0.1162411 FALSE (65,Inf]
## 3 -0.3021621 52371 0.1336618 0.9906832 TRUE (25,65]
## 4 0.5460398 52371 0.7103168 0.1873560 FALSE (25,65]
## 5 -0.7792757 52371 1.3366176 0.8490238 FALSE (25,65]
## 6 0.1219389 52371 0.0000000 0.3295085 TRUE (25,65]
## Income is_employed_AddMissing Income_MissingfixedByZeroes Income_fixedByMean
## 1 NA TRUE 0 66198.67
## 2 NA missing 0 66198.67
## 3 4500 missing 4500 4500.00
## 4 20000 missing 20000 20000.00
## 5 12000 TRUE 12000 12000.00
## 6 180000 FALSE 180000 180000.00
## Median.Income.y
## 1 52371
## 2 52371
## 3 52371
## 4 52371
## 5 52371
## 6 52371
# Column inventory after the merge.
colnames(custdata_merged)
## [1] "state.of.res" "custid"
## [3] "sex" "is.employed"
## [5] "income" "marital.stat"
## [7] "health.ins" "housing.type"
## [9] "recent.move" "num.vehicles"
## [11] "age" "is.employed.fix1"
## [13] "age.normalized" "Median.Income.x"
## [15] "income.norm" "gp"
## [17] "income.lt.30K" "age.range"
## [19] "Income" "is_employed_AddMissing"
## [21] "Income_MissingfixedByZeroes" "Income_fixedByMean"
## [23] "Median.Income.y"

# Summarise the state, the NA-bearing Income column and the joined median.
summary(custdata_merged[c("state.of.res", "Income", "Median.Income.y")])
## state.of.res Income Median.Income.y
## California :114 Min. : 0 Min. :37427
## New York : 94 1st Qu.: 25000 1st Qu.:44819
## Pennsylvania: 63 Median : 45000 Median :50118
## Ohio : 59 Mean : 66199 Mean :50919
## Illinois : 52 3rd Qu.: 82000 3rd Qu.:55534
## Texas : 51 Max. :615000 Max. :68187
## (Other) :567 NA's :328

# Express each customer's income as a multiple of the joined state median.
# NOTE(review): this divides the lowercase `income` column, not the `Income`
# column summarised above -- confirm that is the intended input.
custdata_merged[["income_normed"]] <-
  custdata_merged[["income"]] / custdata_merged[["Median.Income.y"]]
summary(custdata_merged[["income_normed"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.1956 0.2812 0.6712 1.0781 1.3508 11.7870
# Logical flag: TRUE where the customer's income is below 20,000.
custdata_merged$Income_lt20K <- custdata_merged$income < 20000

# Bucket age into three ranges split at 25 and 65. cut() builds right-closed
# intervals by default, which would leave the lowest break (0) outside every
# bin and turn an age of exactly 0 into NA. include.lowest = TRUE closes the
# first interval, so its label becomes "[0,25]" rather than "(0,25]".
custdata_merged$Age_range <- cut(custdata_merged$age,
                                 breaks = c(0, 25, 65, Inf),
                                 include.lowest = TRUE)
# Part C: Principal Component Analysis ----
# Reload the workspace so the PCA starts from the original custdata.
load("/Users/alicia/Downloads/exampleData.rData")

# Keep only the numeric columns. vapply() always yields a logical vector,
# whereas sapply() would return a list for a data frame with no columns.
numeric_data <- custdata[vapply(custdata, is.numeric, logical(1))]

# prcomp() cannot handle missing values, so drop incomplete rows first.
numeric_data <- na.omit(numeric_data)

# PCA on standardised variables. The argument is spelled `scale.` in full
# rather than relying on partial matching of `scale`.
# NOTE(review): the loadings printed below include custid, and age and
# age.normalized have identical loadings with a last component of ~0 standard
# deviation -- consider whether these columns belong in the PCA input.
pca_result <- prcomp(numeric_data, scale. = TRUE)

# Loadings: one row per input variable, one column per principal component.
pca_result$rotation
## PC1 PC2 PC3 PC4 PC5
## custid -0.008609599 -0.04980994 0.451580454 0.587134423 0.26177310
## income -0.400517179 -0.56601375 -0.003895934 -0.058235748 -0.13490866
## num.vehicles 0.022971461 -0.20199678 0.419709217 -0.180527555 0.56032729
## age -0.579297460 0.38897594 0.075934830 -0.007043807 0.06637725
## age.normalized -0.579297460 0.38897594 0.075934830 -0.007043807 0.06637725
## Median.Income 0.013242508 0.08271489 0.342597291 -0.549028425 -0.44617958
## income.norm -0.400598655 -0.57091231 -0.046643100 0.009039828 -0.08119861
## gp -0.037083761 -0.01341069 -0.447676639 -0.432926165 0.61071119
## Income 0.076047548 -0.02788700 0.537037276 -0.360906281 0.10889610
## PC6 PC7 PC8 PC9
## custid -0.41584981 -0.45532632 -0.0038513085 6.483859e-18
## income -0.03488429 -0.06110383 0.7018952246 -5.665401e-16
## num.vehicles -0.13115213 0.64709492 -0.0049190436 -1.168655e-17
## age 0.03537803 0.04047140 0.0009898075 -7.071068e-01
## age.normalized 0.03537803 0.04047140 0.0009898075 7.071068e-01
## Median.Income -0.59932590 -0.08584970 -0.0924112620 4.881995e-17
## income.norm 0.05465563 -0.05519960 -0.7061834828 5.114871e-16
## gp -0.25702549 -0.41421238 -0.0018309020 1.934688e-17
## Income 0.61474145 -0.43005522 0.0079874689 2.225367e-17
# Scree plot of the component variances, drawn as a line chart.
screeplot(x = pca_result, type = "lines")

# Standard deviation, proportion of variance and cumulative proportion for
# each component.
summary(object = pca_result)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 1.4272 1.4151 1.0606 1.0170 0.9931 0.9720 0.92651
## Proportion of Variance 0.2263 0.2225 0.1250 0.1149 0.1096 0.1050 0.09538
## Cumulative Proportion 0.2263 0.4488 0.5738 0.6887 0.7983 0.9033 0.99867
## PC8 PC9
## Standard deviation 0.10935 2.712e-16
## Proportion of Variance 0.00133 0.000e+00
## Cumulative Proportion 1.00000 1.000e+00