# Load the example workspace and list the objects it provides.
load(file = "/Users/alicia/Downloads/exampleData.rData")
ls()
## [1] "custdata"     "hhdata"       "medianincome"
# Inspect the columns available in the customer table.
colnames(custdata)
##  [1] "state.of.res"     "custid"           "sex"              "is.employed"     
##  [5] "income"           "marital.stat"     "health.ins"       "housing.type"    
##  [9] "recent.move"      "num.vehicles"     "age"              "is.employed.fix1"
## [13] "age.normalized"   "Median.Income"    "income.norm"      "gp"              
## [17] "income.lt.30K"    "age.range"        "Income"
# Keep every column except the seven listed here.
custdata <- custdata[setdiff(
  colnames(custdata),
  c(
    "is.employed.fix1",
    "age.normalized",
    "Median.Income",
    "income.norm",
    "gp",
    "income.lt.30K",
    "age.range"
  )
)]
# Count the missing values in each remaining column.
colSums(is.na(custdata))
## state.of.res       custid          sex  is.employed       income marital.stat 
##            0            0            0          328            0            0 
##   health.ins housing.type  recent.move num.vehicles          age       Income 
##            0           56           56           56            0          328
# Drop every row that has at least one missing value, then recount to confirm.
custdata <- na.omit(custdata)
colSums(is.na(custdata))
## state.of.res       custid          sex  is.employed       income marital.stat 
##            0            0            0            0            0            0 
##   health.ins housing.type  recent.move num.vehicles          age       Income 
##            0            0            0            0            0            0

A.d) Using the original data set:

# Reload the workspace so custdata is back in its original, unmodified state.
load(file = "/Users/alicia/Downloads/exampleData.rData")
# is.employed: keep missingness as a category of its own.
sum(is.na(custdata$is.employed))
## [1] 328
# Convert to character, label the NAs "missing", and store the result as a
# factor in a new column (the original is.employed column is left untouched).
custdata$is_employed_AddMissing <- factor(replace(
  as.character(custdata$is.employed),
  is.na(custdata$is.employed),
  "missing"
))

# Income, strategy 1: replace missing values with zero.
sum(is.na(custdata$Income))
## [1] 328
custdata$Income_MissingfixedByZeroes <- replace(
  custdata$Income,
  is.na(custdata$Income),
  0
)

summary(custdata$Income_MissingfixedByZeroes)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0       0   25250   44486   60000  615000
# Income, strategy 2: replace missing values with the mean of the observed ones.
mean_income <- mean(custdata$Income, na.rm = TRUE)
mean_income
## [1] 66198.67
custdata$Income_fixedByMean <- replace(
  custdata$Income,
  is.na(custdata$Income),
  mean_income
)

summary(custdata$Income_fixedByMean)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0   35000   66199   66199   66199  615000

Part B: Data Transformations

# Attach median income to each customer row by matching
# custdata$state.of.res against medianincome$State.
# custdata already carries a Median.Income column, so merge() disambiguates the
# clash with suffixes: Median.Income.x (from custdata) and Median.Income.y
# (from medianincome).
custdata_merged <- merge(
  x = custdata,
  y = medianincome,
  by.x = "state.of.res",
  by.y = "State"
)
head(custdata_merged)
##   state.of.res  custid sex is.employed income       marital.stat health.ins
## 1      Alabama 1063014   F        TRUE  82000            Married       TRUE
## 2      Alabama 1192089   M          NA  49000            Married       TRUE
## 3      Alabama   16551   F          NA   7000            Married       TRUE
## 4      Alabama 1079878   F          NA  37200 Divorced/Separated       TRUE
## 5      Alabama  502705   M        TRUE  70000            Married      FALSE
## 6      Alabama  674271   M       FALSE      0            Married       TRUE
##                   housing.type recent.move num.vehicles age is.employed.fix1
## 1                       Rented       FALSE            2  43         employed
## 2     Homeowner free and clear       FALSE            2  77          missing
## 3 Homeowner with mortgage/loan       FALSE            2  46          missing
## 4 Homeowner with mortgage/loan       FALSE            1  62          missing
## 5                       Rented       FALSE            4  37         employed
## 6                       Rented        TRUE            1  54     not employed
##   age.normalized Median.Income.x income.norm        gp income.lt.30K age.range
## 1     -0.4612000           52371   1.5657520 0.9350600         FALSE   (25,65]
## 2      1.3412291           52371   0.9356323 0.1162411         FALSE  (65,Inf]
## 3     -0.3021621           52371   0.1336618 0.9906832          TRUE   (25,65]
## 4      0.5460398           52371   0.7103168 0.1873560         FALSE   (25,65]
## 5     -0.7792757           52371   1.3366176 0.8490238         FALSE   (25,65]
## 6      0.1219389           52371   0.0000000 0.3295085          TRUE   (25,65]
##   Income is_employed_AddMissing Income_MissingfixedByZeroes Income_fixedByMean
## 1     NA                   TRUE                           0           66198.67
## 2     NA                missing                           0           66198.67
## 3   4500                missing                        4500            4500.00
## 4  20000                missing                       20000           20000.00
## 5  12000                   TRUE                       12000           12000.00
## 6 180000                  FALSE                      180000          180000.00
##   Median.Income.y
## 1           52371
## 2           52371
## 3           52371
## 4           52371
## 5           52371
## 6           52371
# List the merged columns; both Median.Income.x and Median.Income.y are present.
colnames(custdata_merged)
##  [1] "state.of.res"                "custid"                     
##  [3] "sex"                         "is.employed"                
##  [5] "income"                      "marital.stat"               
##  [7] "health.ins"                  "housing.type"               
##  [9] "recent.move"                 "num.vehicles"               
## [11] "age"                         "is.employed.fix1"           
## [13] "age.normalized"              "Median.Income.x"            
## [15] "income.norm"                 "gp"                         
## [17] "income.lt.30K"               "age.range"                  
## [19] "Income"                      "is_employed_AddMissing"     
## [21] "Income_MissingfixedByZeroes" "Income_fixedByMean"         
## [23] "Median.Income.y"
# Summarise the state, Income and merged-in median income columns.
summary(custdata_merged[c("state.of.res", "Income", "Median.Income.y")])
##        state.of.res     Income       Median.Income.y
##  California  :114   Min.   :     0   Min.   :37427  
##  New York    : 94   1st Qu.: 25000   1st Qu.:44819  
##  Pennsylvania: 63   Median : 45000   Median :50118  
##  Ohio        : 59   Mean   : 66199   Mean   :50919  
##  Illinois    : 52   3rd Qu.: 82000   3rd Qu.:55534  
##  Texas       : 51   Max.   :615000   Max.   :68187  
##  (Other)     :567   NA's   :328
# Express each customer's income relative to the merged-in median income.
# NOTE(review): this divides the lowercase `income` column, whereas the
# preceding summary() inspected `Income`; confirm which column is intended.
custdata_merged$income_normed <- with(
  custdata_merged,
  income / Median.Income.y
)
summary(custdata_merged$income_normed)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -0.1956  0.2812  0.6712  1.0781  1.3508 11.7870
# Flag customers whose income is below 20,000.
custdata_merged$Income_lt20K <- with(custdata_merged, income < 20000)
# Bin age into the intervals (0,25], (25,65] and (65,Inf].
# NOTE(review): cut() leaves the lowest break open by default, so an age of
# exactly 0 would become NA; pass include.lowest = TRUE if that is not wanted.
custdata_merged$Age_range <- cut(
  x = custdata_merged$age,
  breaks = c(0, 25, 65, Inf)
)

Part C: Principal Component Analysis

# Reload the workspace so the PCA starts from the original custdata.
load("/Users/alicia/Downloads/exampleData.rData")
# Keep only the numeric columns. vapply() guarantees exactly one logical per
# column, unlike sapply(), whose return type depends on its input.
numeric_data <- custdata[vapply(custdata, is.numeric, logical(1))]
# Drop rows with a missing value in any numeric column before decomposing.
numeric_data <- na.omit(numeric_data)
# Centre (the default) and scale every column to unit variance. The argument
# is spelled `scale.` in full; `scale = TRUE` only worked via partial matching.
pca_result <- prcomp(numeric_data, scale. = TRUE)
# NOTE(review): in the loadings below, age and age.normalized have identical
# weights on PC1-PC8 and PC9 has a standard deviation of ~0, i.e. the two
# columns are redundant. custid also looks like an identifier rather than a
# measurement. Consider excluding such columns before running the PCA.
pca_result$rotation
##                         PC1         PC2          PC3          PC4         PC5
## custid         -0.008609599 -0.04980994  0.451580454  0.587134423  0.26177310
## income         -0.400517179 -0.56601375 -0.003895934 -0.058235748 -0.13490866
## num.vehicles    0.022971461 -0.20199678  0.419709217 -0.180527555  0.56032729
## age            -0.579297460  0.38897594  0.075934830 -0.007043807  0.06637725
## age.normalized -0.579297460  0.38897594  0.075934830 -0.007043807  0.06637725
## Median.Income   0.013242508  0.08271489  0.342597291 -0.549028425 -0.44617958
## income.norm    -0.400598655 -0.57091231 -0.046643100  0.009039828 -0.08119861
## gp             -0.037083761 -0.01341069 -0.447676639 -0.432926165  0.61071119
## Income          0.076047548 -0.02788700  0.537037276 -0.360906281  0.10889610
##                        PC6         PC7           PC8           PC9
## custid         -0.41584981 -0.45532632 -0.0038513085  6.483859e-18
## income         -0.03488429 -0.06110383  0.7018952246 -5.665401e-16
## num.vehicles   -0.13115213  0.64709492 -0.0049190436 -1.168655e-17
## age             0.03537803  0.04047140  0.0009898075 -7.071068e-01
## age.normalized  0.03537803  0.04047140  0.0009898075  7.071068e-01
## Median.Income  -0.59932590 -0.08584970 -0.0924112620  4.881995e-17
## income.norm     0.05465563 -0.05519960 -0.7061834828  5.114871e-16
## gp             -0.25702549 -0.41421238 -0.0018309020  1.934688e-17
## Income          0.61474145 -0.43005522  0.0079874689  2.225367e-17
# Scree plot of the component variances, drawn as a line chart.
screeplot(x = pca_result, type = "lines")

# Standard deviation and (cumulative) proportion of variance per component.
summary(object = pca_result)
## Importance of components:
##                           PC1    PC2    PC3    PC4    PC5    PC6     PC7
## Standard deviation     1.4272 1.4151 1.0606 1.0170 0.9931 0.9720 0.92651
## Proportion of Variance 0.2263 0.2225 0.1250 0.1149 0.1096 0.1050 0.09538
## Cumulative Proportion  0.2263 0.4488 0.5738 0.6887 0.7983 0.9033 0.99867
##                            PC8       PC9
## Standard deviation     0.10935 2.712e-16
## Proportion of Variance 0.00133 0.000e+00
## Cumulative Proportion  1.00000 1.000e+00