HW3

LAB 3: DATA PREPROCESSING

Part A: Cleaning, finding, and handling missing values

# Load the saved workspace image; the statements below use the `custdata`
# data frame it provides.
load(file = "/Users/alicia/Downloads/exampleData.rData")
# List the column names of the customer table.
colnames(custdata)

##  [1] "state.of.res"     "custid"           "sex"              "is.employed"     
##  [5] "income"           "marital.stat"     "health.ins"       "housing.type"    
##  [9] "recent.move"      "num.vehicles"     "age"              "is.employed.fix1"
## [13] "age.normalized"   "Median.Income"    "income.norm"      "gp"              
## [17] "income.lt.30K"    "age.range"        "Income"

A.a) removing features

# Columns to discard from the customer table.
features_to_drop <- c(
  "is.employed.fix1", "age.normalized", "Median.Income",
  "income.norm", "gp", "income.lt.30K", "age.range"
)
# Keep every column whose name is not in the drop list.
is_dropped <- colnames(custdata) %in% features_to_drop
custdata <- custdata[, !is_dropped]

A.b) finding missing values

# Flag every NA cell, then total the flags per column.
na_flags <- is.na(custdata)
colSums(na_flags)

## state.of.res       custid          sex  is.employed       income marital.stat 
##            0            0            0          328            0            0 
##   health.ins housing.type  recent.move num.vehicles          age       Income 
##            0           56           56           56            0          328

A.c) dropping the customers with missing values

# Remove every row that has an NA in any column.
custdata <- stats::na.omit(custdata)
# Re-count NAs per column to confirm that none remain.
colSums(x = is.na(custdata))

## state.of.res       custid          sex  is.employed       income marital.stat 
##            0            0            0            0            0            0 
##   health.ins housing.type  recent.move num.vehicles          age       Income 
##            0            0            0            0            0            0

A.d) using the original data set:

# Reload the workspace image so `custdata` is back to its original contents
# (the earlier steps dropped columns and rows).
load(file = "/Users/alicia/Downloads/exampleData.rData")

# Copy the employment flag, recode its NA entries as the string "missing",
# and store the result as a factor.
custdata$is_employed_AddMissing <- as.factor(
  replace(custdata$is.employed, is.na(custdata$is.employed), "missing")
)

# Copy of Income in which every NA is replaced by 0.
custdata$Income_MissingfixedByZeroes <- replace(
  custdata$Income,
  is.na(custdata$Income),
  0
)

# Distribution of the zero-imputed column.
summary(object = custdata$Income_MissingfixedByZeroes)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0       0   25250   44486   60000  615000

# Mean of the observed (non-NA) Income values.
mean_income <- mean(custdata[["Income"]], na.rm = TRUE)
print(mean_income)

## [1] 66198.67

# Copy of Income in which every NA is replaced by the observed mean
# (`mean_income`, computed above).
custdata$Income_fixedByMean <- replace(
  custdata$Income,
  is.na(custdata$Income),
  mean_income
)

# Distribution of the mean-imputed column.
summary(object = custdata$Income_fixedByMean)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0   35000   66199   66199   66199  615000

Part B: Data Transformations

# Join each customer to the `medianincome` table, matching the customer's
# state of residence against the table's State column.
custdata_merged <- merge(
  x = custdata,
  y = medianincome,
  by.x = "state.of.res",
  by.y = "State"
)

# Summarise the join key, the customer income, and the median income that
# came from `medianincome` (suffixed ".y" by merge()).
summary_cols <- c("state.of.res", "Income", "Median.Income.y")
summary(custdata_merged[, summary_cols])

##        state.of.res     Income       Median.Income.y
##  California  :114   Min.   :     0   Min.   :37427  
##  New York    : 94   1st Qu.: 25000   1st Qu.:44819  
##  Pennsylvania: 63   Median : 45000   Median :50118  
##  Ohio        : 59   Mean   : 66199   Mean   :50919  
##  Illinois    : 52   3rd Qu.: 82000   3rd Qu.:55534  
##  Texas       : 51   Max.   :615000   Max.   :68187  
##  (Other)     :567   NA's   :328

# Express each customer's income as a multiple of the median income joined
# in from `medianincome`.
# NOTE(review): this uses the lowercase `income` column, not `Income`
# (the column summarised just above, which contains NAs) - confirm this is
# the intended source column.
custdata_merged$income_normed <- with(
  custdata_merged,
  income / Median.Income.y
)
summary(object = custdata_merged$income_normed)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -0.1956  0.2812  0.6712  1.0781  1.3508 11.7870

# Logical flag: TRUE where the customer's `income` is below 20000.
income_cutoff <- 20000
custdata_merged$Income_lt20K <- custdata_merged[["income"]] < income_cutoff

# Bin age into three ranges: [0, 25], (25, 65], (65, Inf].
# cut() builds right-closed intervals by default, so without
# include.lowest = TRUE an age of exactly 0 would fall outside the first
# bin and silently become NA.
custdata_merged$Age_range <- cut(custdata_merged$age,
                                 breaks = c(0, 25, 65, Inf),
                                 include.lowest = TRUE)

Part C: Principal Component Analysis

# Reload the workspace image so the PCA starts from the original `custdata`.
load("/Users/alicia/Downloads/exampleData.rData")

# Keep only the numeric columns. vapply() always returns a logical vector,
# whereas sapply()'s return type depends on its input.
is_numeric_col <- vapply(custdata, is.numeric, logical(1))
numeric_data <- custdata[is_numeric_col]

# prcomp() cannot handle NA, so drop every row with a missing value.
numeric_data <- na.omit(numeric_data)

# PCA on standardised columns. The argument is spelled `scale.`; writing
# `scale = TRUE` only worked through partial argument matching.
# NOTE(review): the numeric columns include the identifier `custid` and
# `age.normalized`, whose loadings are identical to those of `age` in the
# printed output (last component has ~0 variance) - consider excluding them.
pca_result <- prcomp(numeric_data, scale. = TRUE)

# Loadings matrix: one row per input variable, one column per component.
pca_result[["rotation"]]

##                         PC1         PC2          PC3          PC4         PC5
## custid         -0.008609599 -0.04980994  0.451580454  0.587134423  0.26177310
## income         -0.400517179 -0.56601375 -0.003895934 -0.058235748 -0.13490866
## num.vehicles    0.022971461 -0.20199678  0.419709217 -0.180527555  0.56032729
## age            -0.579297460  0.38897594  0.075934830 -0.007043807  0.06637725
## age.normalized -0.579297460  0.38897594  0.075934830 -0.007043807  0.06637725
## Median.Income   0.013242508  0.08271489  0.342597291 -0.549028425 -0.44617958
## income.norm    -0.400598655 -0.57091231 -0.046643100  0.009039828 -0.08119861
## gp             -0.037083761 -0.01341069 -0.447676639 -0.432926165  0.61071119
## Income          0.076047548 -0.02788700  0.537037276 -0.360906281  0.10889610
##                        PC6         PC7           PC8           PC9
## custid         -0.41584981 -0.45532632 -0.0038513085  6.483859e-18
## income         -0.03488429 -0.06110383  0.7018952246 -5.665401e-16
## num.vehicles   -0.13115213  0.64709492 -0.0049190436 -1.168655e-17
## age             0.03537803  0.04047140  0.0009898075 -7.071068e-01
## age.normalized  0.03537803  0.04047140  0.0009898075  7.071068e-01
## Median.Income  -0.59932590 -0.08584970 -0.0924112620  4.881995e-17
## income.norm     0.05465563 -0.05519960 -0.7061834828  5.114871e-16
## gp             -0.25702549 -0.41421238 -0.0018309020  1.934688e-17
## Income          0.61474145 -0.43005522  0.0079874689  2.225367e-17

# Scree plot of the component variances, drawn as a line chart.
screeplot(x = pca_result, type = "lines")

# Standard deviation and proportion of variance for each component.
summary(object = pca_result)

## Importance of components:
##                           PC1    PC2    PC3    PC4    PC5    PC6     PC7
## Standard deviation     1.4272 1.4151 1.0606 1.0170 0.9931 0.9720 0.92651
## Proportion of Variance 0.2263 0.2225 0.1250 0.1149 0.1096 0.1050 0.09538
## Cumulative Proportion  0.2263 0.4488 0.5738 0.6887 0.7983 0.9033 0.99867
##                            PC8       PC9
## Standard deviation     0.10935 2.712e-16
## Proportion of Variance 0.00133 0.000e+00
## Cumulative Proportion  1.00000 1.000e+00

HW3

Alicia

2026-03-17