load("C:/Users/dipsy/Downloads/exampleData.rData")
str("exampleData")
## chr "exampleData"
any(is.na("exampleData"))
## [1] FALSE
sum(is.na("exampleData"))
## [1] 0
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.5.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
str(custdata)
## 'data.frame': 1000 obs. of 19 variables:
## $ state.of.res : Factor w/ 50 levels "Alabama","Alaska",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ custid : int 1063014 1192089 16551 1079878 502705 674271 15917 467335 462569 1216026 ...
## $ sex : Factor w/ 2 levels "F","M": 1 2 1 1 2 2 1 2 2 2 ...
## $ is.employed : logi TRUE NA NA NA TRUE FALSE ...
## $ income : int 82000 49000 7000 37200 70000 0 24000 42600 22000 9600 ...
## $ marital.stat : Factor w/ 4 levels "Divorced/Separated",..: 2 2 2 1 2 2 1 3 4 3 ...
## $ health.ins : logi TRUE TRUE TRUE TRUE FALSE TRUE ...
## $ housing.type : Factor w/ 4 levels "Homeowner free and clear",..: 4 1 2 2 4 4 1 4 1 4 ...
## $ recent.move : logi FALSE FALSE FALSE FALSE FALSE TRUE ...
## $ num.vehicles : int 2 2 2 1 4 1 1 1 0 6 ...
## $ age : num 43 77 46 62 37 54 70 33 89 50 ...
## $ is.employed.fix1: chr "employed" "missing" "missing" "missing" ...
## $ age.normalized : num -0.461 1.341 -0.302 0.546 -0.779 ...
## $ Median.Income : num 52371 52371 52371 52371 52371 ...
## $ income.norm : num 1.566 0.936 0.134 0.71 1.337 ...
## $ gp : num 0.935 0.116 0.991 0.187 0.849 ...
## $ income.lt.30K : logi FALSE FALSE TRUE FALSE FALSE TRUE ...
## $ age.range : Factor w/ 3 levels "[0,25]","(25,65]",..: 2 3 2 2 2 2 3 2 3 2 ...
## $ Income : num NA NA 4500 20000 12000 180000 120000 40000 NA 24000 ...
names(custdata)
## [1] "state.of.res" "custid" "sex" "is.employed"
## [5] "income" "marital.stat" "health.ins" "housing.type"
## [9] "recent.move" "num.vehicles" "age" "is.employed.fix1"
## [13] "age.normalized" "Median.Income" "income.norm" "gp"
## [17] "income.lt.30K" "age.range" "Income"
colSums(is.na(custdata))
## state.of.res custid sex is.employed
## 0 0 0 328
## income marital.stat health.ins housing.type
## 0 0 0 56
## recent.move num.vehicles age is.employed.fix1
## 56 56 0 0
## age.normalized Median.Income income.norm gp
## 0 0 0 0
## income.lt.30K age.range Income
## 0 0 328
custdata[!complete.cases(custdata), ]
custdata$is.employed[is.na(custdata$is.employed)] <- FALSE
custdata$Income[is.na(custdata$Income)] <- mean(custdata$Income, na.rm = TRUE)
sum(is.na(custdata))
## [1] 168
colSums(is.na(custdata))
## state.of.res custid sex is.employed
## 0 0 0 0
## income marital.stat health.ins housing.type
## 0 0 0 56
## recent.move num.vehicles age is.employed.fix1
## 56 56 0 0
## age.normalized Median.Income income.norm gp
## 0 0 0 0
## income.lt.30K age.range Income
## 0 0 0
missing_values <- colSums(is.na(custdata))
data.frame(Attribute = names(missing_values), Missing_Values = missing_values)
colSums(is.na(custdata))
## state.of.res custid sex is.employed
## 0 0 0 0
## income marital.stat health.ins housing.type
## 0 0 0 56
## recent.move num.vehicles age is.employed.fix1
## 56 56 0 0
## age.normalized Median.Income income.norm gp
## 0 0 0 0
## income.lt.30K age.range Income
## 0 0 0
custdata_clean <- custdata[complete.cases(custdata), ]
nrow(custdata)
## [1] 1000
nrow(custdata_clean)
## [1] 944
colSums(is.na(custdata_clean))
## state.of.res custid sex is.employed
## 0 0 0 0
## income marital.stat health.ins housing.type
## 0 0 0 0
## recent.move num.vehicles age is.employed.fix1
## 0 0 0 0
## age.normalized Median.Income income.norm gp
## 0 0 0 0
## income.lt.30K age.range Income
## 0 0 0
load("C:/Users/dipsy/Downloads/exampleData.rData")
sum(is.na(custdata$is.employed))
## [1] 328
custdata$is_employed_AddMissing <- custdata$is.employed
custdata$is_employed_AddMissing[is.na(custdata$is_employed_AddMissing)] <- "missing"
custdata$is_employed_AddMissing <- as.factor(custdata$is_employed_AddMissing)
str(custdata$is_employed_AddMissing)
## Factor w/ 3 levels "FALSE","missing",..: 3 2 2 2 3 1 3 3 2 2 ...
table(custdata$is_employed_AddMissing)
##
## FALSE missing TRUE
## 73 328 599
sum(is.na(custdata$Income))
## [1] 328
custdata$Income_MissingfixedByZeroes <- custdata$Income
custdata$Income_MissingfixedByZeroes[is.na(custdata$Income_MissingfixedByZeroes)] <- 0
sum(is.na(custdata$Income_MissingfixedByZeroes))
## [1] 0
summary(custdata$Income_MissingfixedByZeroes)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 25250 44486 60000 615000
sum(is.na(custdata$Income))
## [1] 328
mean_income <- mean(custdata$Income, na.rm = TRUE)
custdata$Income_fixedByMean <- custdata$Income
custdata$Income_fixedByMean[is.na(custdata$Income_fixedByMean)] <- mean_income
sum(is.na(custdata$Income_fixedByMean))
## [1] 0
custdata_merged <- merge(custdata, medianincome,
by.x = "state.of.res",
by.y = "State")
str(custdata_merged)
## 'data.frame': 1000 obs. of 23 variables:
## $ state.of.res : Factor w/ 50 levels "Alabama","Alaska",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ custid : int 1063014 1192089 16551 1079878 502705 674271 15917 467335 462569 1216026 ...
## $ sex : Factor w/ 2 levels "F","M": 1 2 1 1 2 2 1 2 2 2 ...
## $ is.employed : logi TRUE NA NA NA TRUE FALSE ...
## $ income : int 82000 49000 7000 37200 70000 0 24000 42600 22000 9600 ...
## $ marital.stat : Factor w/ 4 levels "Divorced/Separated",..: 2 2 2 1 2 2 1 3 4 3 ...
## $ health.ins : logi TRUE TRUE TRUE TRUE FALSE TRUE ...
## $ housing.type : Factor w/ 4 levels "Homeowner free and clear",..: 4 1 2 2 4 4 1 4 1 4 ...
## $ recent.move : logi FALSE FALSE FALSE FALSE FALSE TRUE ...
## $ num.vehicles : int 2 2 2 1 4 1 1 1 0 6 ...
## $ age : num 43 77 46 62 37 54 70 33 89 50 ...
## $ is.employed.fix1 : chr "employed" "missing" "missing" "missing" ...
## $ age.normalized : num -0.461 1.341 -0.302 0.546 -0.779 ...
## $ Median.Income.x : num 52371 52371 52371 52371 52371 ...
## $ income.norm : num 1.566 0.936 0.134 0.71 1.337 ...
## $ gp : num 0.935 0.116 0.991 0.187 0.849 ...
## $ income.lt.30K : logi FALSE FALSE TRUE FALSE FALSE TRUE ...
## $ age.range : Factor w/ 3 levels "[0,25]","(25,65]",..: 2 3 2 2 2 2 3 2 3 2 ...
## $ Income : num NA NA 4500 20000 12000 180000 120000 40000 NA 24000 ...
## $ is_employed_AddMissing : Factor w/ 3 levels "FALSE","missing",..: 3 2 2 2 3 1 3 3 2 2 ...
## $ Income_MissingfixedByZeroes: num 0 0 4500 20000 12000 180000 120000 40000 0 24000 ...
## $ Income_fixedByMean : num 66199 66199 4500 20000 12000 ...
## $ Median.Income.y : num 52371 52371 52371 52371 52371 ...
head(custdata_merged)
dim(custdata_merged)
## [1] 1000 23
summary(custdata_merged[, c("state.of.res", "Income", "Median.Income.y")])
## state.of.res Income Median.Income.y
## California :114 Min. : 0 Min. :37427
## New York : 94 1st Qu.: 25000 1st Qu.:44819
## Pennsylvania: 63 Median : 45000 Median :50118
## Ohio : 59 Mean : 66199 Mean :50919
## Illinois : 52 3rd Qu.: 82000 3rd Qu.:55534
## Texas : 51 Max. :615000 Max. :68187
## (Other) :567 NA's :328
names(custdata_merged)
## [1] "state.of.res" "custid"
## [3] "sex" "is.employed"
## [5] "income" "marital.stat"
## [7] "health.ins" "housing.type"
## [9] "recent.move" "num.vehicles"
## [11] "age" "is.employed.fix1"
## [13] "age.normalized" "Median.Income.x"
## [15] "income.norm" "gp"
## [17] "income.lt.30K" "age.range"
## [19] "Income" "is_employed_AddMissing"
## [21] "Income_MissingfixedByZeroes" "Income_fixedByMean"
## [23] "Median.Income.y"
custdata_merged$income_normed <- custdata_merged$income / custdata_merged$Median.Income.y
summary(custdata_merged$income_normed)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.1956 0.2812 0.6712 1.0781 1.3508 11.7870
custdata <- custdata %>%
mutate(
Income_lt20K = income < 20000
)
custdata <- custdata %>%
mutate(
Age_range = cut(
age,
breaks = c(0, 25, 65, Inf),
include.lowest = TRUE,
right = TRUE,
labels = c("[0,25]", "(25,65]", "(65,Inf]")
)
)
table(custdata$Income_lt20K, useNA = "ifany")
##
## FALSE TRUE
## 678 322
table(custdata$Age_range, useNA = "ifany")
##
## [0,25] (25,65] (65,Inf]
## 56 732 212
#C. Principal Component Analysis: (10 points)
custdata_num <- custdata %>%
select(where(is.numeric))
custdata_num_complete <- na.omit(custdata_num)
pca_result <- prcomp(custdata_num_complete, center = TRUE, scale. = TRUE)
loadings <- pca_result$rotation
print(loadings)
## PC1 PC2 PC3 PC4
## custid 0.01142428 0.03302662 -0.037399481 -0.72826416
## income -0.03223412 0.60547968 -0.336305870 0.02510335
## num.vehicles 0.07301064 0.08409691 -0.174797805 -0.27518099
## age -0.09019248 0.34943223 0.601107986 -0.03506363
## age.normalized -0.09019248 0.34943223 0.601107986 -0.03506363
## Median.Income 0.03758558 -0.03719228 0.079758118 -0.13176205
## income.norm -0.03128580 0.60744578 -0.340945226 0.04675630
## gp -0.03987057 0.02937951 -0.005959855 0.60690070
## Income 0.56957569 0.05712197 0.056695747 0.03131448
## Income_MissingfixedByZeroes 0.56957569 0.05712197 0.056695747 0.03131448
## Income_fixedByMean 0.56957569 0.05712197 0.056695747 0.03131448
## PC5 PC6 PC7 PC8
## custid -0.20872179 0.29443959 0.58029047 0.0038512352
## income 0.09321076 -0.11088206 0.07045078 -0.7019096223
## num.vehicles 0.22302808 0.66746354 -0.62132525 0.0049139288
## age -0.01538148 0.05912184 -0.05889997 -0.0009884328
## age.normalized -0.01538148 0.05912184 -0.05889997 -0.0009884328
## Median.Income 0.94389958 -0.12316213 0.24254164 0.0924104975
## income.norm -0.03178884 -0.10198053 0.03837326 0.7061996620
## gp 0.05674514 0.65009478 0.45092859 0.0018337784
## Income -0.02801971 -0.01032755 0.02366876 -0.0026409539
## Income_MissingfixedByZeroes -0.02801971 -0.01032755 0.02366876 -0.0026409539
## Income_fixedByMean -0.02801971 -0.01032755 0.02366876 -0.0026409539
## PC9 PC10 PC11
## custid 5.645814e-18 0.000000e+00 0.000000e+00
## income 9.268621e-18 4.863870e-17 -4.421700e-17
## num.vehicles -6.180119e-17 -1.312365e-17 1.532063e-17
## age 7.071068e-01 2.113380e-17 -1.082852e-16
## age.normalized -7.071068e-01 -5.129245e-17 3.251755e-17
## Median.Income 1.745875e-17 -5.807064e-18 2.387798e-17
## income.norm -7.496433e-17 1.142673e-17 2.405494e-17
## gp 1.191739e-17 -3.375786e-18 4.078414e-18
## Income 1.501805e-16 -8.164966e-01 9.392783e-17
## Income_MissingfixedByZeroes 3.915815e-17 4.082483e-01 -7.071068e-01
## Income_fixedByMean 3.915815e-17 4.082483e-01 7.071068e-01
scree_var <- pca_result$sdev^2
plot(
scree_var,
type = "b",
pch = 19,
xlab = "Principal Component",
ylab = "Eigenvalue / Variance",
main = "Scree Plot"
)
abline(h = 1, col = "red", lty = 2)
pca_summary <- summary(pca_result)
print(pca_summary)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 1.7421 1.4202 1.4126 1.03662 1.00222 0.99191 0.93699
## Proportion of Variance 0.2759 0.1834 0.1814 0.09769 0.09131 0.08944 0.07981
## Cumulative Proportion 0.2759 0.4592 0.6407 0.73834 0.82966 0.91910 0.99891
## PC8 PC9 PC10 PC11
## Standard deviation 0.10935 2.15e-16 3.01e-17 5.788e-33
## Proportion of Variance 0.00109 0.00e+00 0.00e+00 0.000e+00
## Cumulative Proportion 1.00000 1.00e+00 1.00e+00 1.000e+00
variance_table <- data.frame(
Principal_Component = paste0("PC", 1:length(pca_result$sdev)),
Standard_Deviation = pca_result$sdev,
Proportion_of_Variance = (pca_result$sdev^2) / sum(pca_result$sdev^2),
Cumulative_Proportion = cumsum((pca_result$sdev^2) / sum(pca_result$sdev^2))
)
print(variance_table)
## Principal_Component Standard_Deviation Proportion_of_Variance
## 1 PC1 1.742091e+00 2.758984e-01
## 2 PC2 1.420184e+00 1.833566e-01
## 3 PC3 1.412583e+00 1.813993e-01
## 4 PC4 1.036617e+00 9.768862e-02
## 5 PC5 1.002217e+00 9.131255e-02
## 6 PC6 9.919127e-01 8.944462e-02
## 7 PC7 9.369857e-01 7.981293e-02
## 8 PC8 1.093513e-01 1.087063e-03
## 9 PC9 2.149835e-16 4.201629e-33
## 10 PC10 3.009859e-17 8.235682e-35
## 11 PC11 5.787846e-33 3.045378e-66
## Cumulative_Proportion
## 1 0.2758984
## 2 0.4592549
## 3 0.6406542
## 4 0.7383428
## 5 0.8296554
## 6 0.9191000
## 7 0.9989129
## 8 1.0000000
## 9 1.0000000
## 10 1.0000000
## 11 1.0000000
variance_table[, sapply(variance_table, is.numeric)] <-
round(variance_table[, sapply(variance_table, is.numeric)], 4)