LAB 3: DATA PREPROCESSING, Part A: Cleaning, finding, and handling missing values
# Restore the saved workspace for this lab; the statements that follow expect
# it to provide the `custdata` object.
load(file = "/Users/alicia/Downloads/exampleData.rData")
# List the column names before any cleaning is done.
colnames(custdata)
## [1] "state.of.res" "custid" "sex" "is.employed"
## [5] "income" "marital.stat" "health.ins" "housing.type"
## [9] "recent.move" "num.vehicles" "age" "is.employed.fix1"
## [13] "age.normalized" "Median.Income" "income.norm" "gp"
## [17] "income.lt.30K" "age.range" "Income"
A.a) removing features
# Remove the feature columns named below from custdata; every column that is
# not listed is kept, in its original order.
cols_to_drop <- c(
  "is.employed.fix1",
  "age.normalized",
  "Median.Income",
  "income.norm",
  "gp",
  "income.lt.30K",
  "age.range"
)
custdata <- custdata[, !(names(custdata) %in% cols_to_drop)]
A.b) finding missing values
# Count the NA entries in each column of custdata (one named value per column).
vapply(custdata, function(column) sum(is.na(column)), numeric(1))
## state.of.res custid sex is.employed income marital.stat
## 0 0 0 328 0 0
## health.ins housing.type recent.move num.vehicles age Income
## 0 56 56 56 0 328
A.c) dropping the customers with missing values
# Listwise deletion: keep only the rows of custdata that have no NA in any
# column.
custdata <- stats::na.omit(object = custdata)
# Re-count the NA entries per column to confirm that none remain.
vapply(custdata, function(column) sum(is.na(column)), numeric(1))
## state.of.res custid sex is.employed income marital.stat
## 0 0 0 0 0 0
## health.ins housing.type recent.move num.vehicles age Income
## 0 0 0 0 0 0
A.d) using the original data set:
# Reload the saved workspace so the examples below start from the stored
# custdata rather than the cleaned copy produced earlier.
load(file = "/Users/alicia/Downloads/exampleData.rData")

# Categorical fix: copy is.employed, give the NA entries the explicit value
# "missing", and store the result as a factor.
custdata$is_employed_AddMissing <- as.factor(
  replace(custdata$is.employed, is.na(custdata$is.employed), "missing")
)

# Numeric fix 1: copy Income and substitute 0 for every NA.
custdata$Income_MissingfixedByZeroes <- replace(
  custdata$Income, is.na(custdata$Income), 0
)
summary(custdata$Income_MissingfixedByZeroes)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 25250 44486 60000 615000
# Mean of the observed (non-NA) Income values; NAs are excluded from the
# average via na.rm = TRUE.
mean_income <- mean(custdata[["Income"]], na.rm = TRUE)
mean_income
## [1] 66198.67
# Numeric fix 2: copy Income and substitute mean_income for every NA.
custdata$Income_fixedByMean <- replace(
  custdata$Income, is.na(custdata$Income), mean_income
)
summary(custdata$Income_fixedByMean)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 35000 66199 66199 66199 615000
Part B: Data Transformations
# Join medianincome onto custdata by matching custdata$state.of.res against
# medianincome$State.
custdata_merged <- merge(
  x = custdata,
  y = medianincome,
  by.x = "state.of.res",
  by.y = "State"
)
# Median.Income.y is the Median.Income column contributed by medianincome
# (merge() appends ".y" when both inputs share a non-key column name).
summary(custdata_merged[c("state.of.res", "Income", "Median.Income.y")])
## state.of.res Income Median.Income.y
## California :114 Min. : 0 Min. :37427
## New York : 94 1st Qu.: 25000 1st Qu.:44819
## Pennsylvania: 63 Median : 45000 Median :50118
## Ohio : 59 Mean : 66199 Mean :50919
## Illinois : 52 3rd Qu.: 82000 3rd Qu.:55534
## Texas : 51 Max. :615000 Max. :68187
## (Other) :567 NA's :328
# Express each customer's income as a ratio to the Median.Income.y value on
# the same row.
# NOTE(review): the numerator is the lower-case `income` column, not `Income`
# -- confirm that this is the intended column.
custdata_merged$income_normed <- with(
  custdata_merged, income / Median.Income.y
)
summary(custdata_merged$income_normed)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.1956 0.2812 0.6712 1.0781 1.3508 11.7870
# Flag the customers whose income is below 20000.
custdata_merged$Income_lt20K <- custdata_merged$income < 20000
# Bin age into three ranges: [0,25], (25,65] and (65,Inf].
# cut() builds right-closed intervals, so without include.lowest = TRUE the
# first bin would be (0,25] and an age of exactly 0 would silently become NA.
custdata_merged$Age_range <- cut(
  custdata_merged$age,
  breaks = c(0, 25, 65, Inf),
  include.lowest = TRUE
)
Part C: Principal Component Analysis
# Reload the saved workspace so the PCA starts from the stored custdata.
load("/Users/alicia/Downloads/exampleData.rData")
# Keep only the numeric columns. vapply() with logical(1) guarantees that the
# column mask is a logical vector regardless of the input.
numeric_data <- custdata[vapply(custdata, is.numeric, logical(1))]
# Drop every row that contains an NA before fitting.
numeric_data <- na.omit(numeric_data)
# Fit the PCA on standardised columns. The argument is spelled `scale.` (its
# full name in prcomp) instead of relying on partial matching of `scale`.
# NOTE(review): the numeric columns include the identifier custid and both
# age and age.normalized; the printed results show a last component with
# essentially zero variance. Consider excluding identifier and duplicated
# columns before running the PCA.
pca_result <- prcomp(numeric_data, scale. = TRUE)
# Loadings: one row per input column, one column per principal component.
pca_result$rotation
## PC1 PC2 PC3 PC4 PC5
## custid -0.008609599 -0.04980994 0.451580454 0.587134423 0.26177310
## income -0.400517179 -0.56601375 -0.003895934 -0.058235748 -0.13490866
## num.vehicles 0.022971461 -0.20199678 0.419709217 -0.180527555 0.56032729
## age -0.579297460 0.38897594 0.075934830 -0.007043807 0.06637725
## age.normalized -0.579297460 0.38897594 0.075934830 -0.007043807 0.06637725
## Median.Income 0.013242508 0.08271489 0.342597291 -0.549028425 -0.44617958
## income.norm -0.400598655 -0.57091231 -0.046643100 0.009039828 -0.08119861
## gp -0.037083761 -0.01341069 -0.447676639 -0.432926165 0.61071119
## Income 0.076047548 -0.02788700 0.537037276 -0.360906281 0.10889610
## PC6 PC7 PC8 PC9
## custid -0.41584981 -0.45532632 -0.0038513085 6.483859e-18
## income -0.03488429 -0.06110383 0.7018952246 -5.665401e-16
## num.vehicles -0.13115213 0.64709492 -0.0049190436 -1.168655e-17
## age 0.03537803 0.04047140 0.0009898075 -7.071068e-01
## age.normalized 0.03537803 0.04047140 0.0009898075 7.071068e-01
## Median.Income -0.59932590 -0.08584970 -0.0924112620 4.881995e-17
## income.norm 0.05465563 -0.05519960 -0.7061834828 5.114871e-16
## gp -0.25702549 -0.41421238 -0.0018309020 1.934688e-17
## Income 0.61474145 -0.43005522 0.0079874689 2.225367e-17
# Scree plot of the component variances, drawn as a line chart.
screeplot(x = pca_result, type = "lines")
# Per-component standard deviation, proportion of variance and cumulative
# proportion.
summary(object = pca_result)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 1.4272 1.4151 1.0606 1.0170 0.9931 0.9720 0.92651
## Proportion of Variance 0.2263 0.2225 0.1250 0.1149 0.1096 0.1050 0.09538
## Cumulative Proportion 0.2263 0.4488 0.5738 0.6887 0.7983 0.9033 0.99867
## PC8 PC9
## Standard deviation 0.10935 2.712e-16
## Proportion of Variance 0.00133 0.000e+00
## Cumulative Proportion 1.00000 1.000e+00