Lab 3 (Data Preprocessing)

load("C:/Users/dipsy/Downloads/exampleData.rData")

str("exampleData")

##  chr "exampleData"

any(is.na("exampleData"))

## [1] FALSE

sum(is.na("exampleData"))

## [1] 0

library(dplyr)

## Warning: package 'dplyr' was built under R version 4.5.3

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

str(custdata)

## 'data.frame':    1000 obs. of  19 variables:
##  $ state.of.res    : Factor w/ 50 levels "Alabama","Alaska",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ custid          : int  1063014 1192089 16551 1079878 502705 674271 15917 467335 462569 1216026 ...
##  $ sex             : Factor w/ 2 levels "F","M": 1 2 1 1 2 2 1 2 2 2 ...
##  $ is.employed     : logi  TRUE NA NA NA TRUE FALSE ...
##  $ income          : int  82000 49000 7000 37200 70000 0 24000 42600 22000 9600 ...
##  $ marital.stat    : Factor w/ 4 levels "Divorced/Separated",..: 2 2 2 1 2 2 1 3 4 3 ...
##  $ health.ins      : logi  TRUE TRUE TRUE TRUE FALSE TRUE ...
##  $ housing.type    : Factor w/ 4 levels "Homeowner free and clear",..: 4 1 2 2 4 4 1 4 1 4 ...
##  $ recent.move     : logi  FALSE FALSE FALSE FALSE FALSE TRUE ...
##  $ num.vehicles    : int  2 2 2 1 4 1 1 1 0 6 ...
##  $ age             : num  43 77 46 62 37 54 70 33 89 50 ...
##  $ is.employed.fix1: chr  "employed" "missing" "missing" "missing" ...
##  $ age.normalized  : num  -0.461 1.341 -0.302 0.546 -0.779 ...
##  $ Median.Income   : num  52371 52371 52371 52371 52371 ...
##  $ income.norm     : num  1.566 0.936 0.134 0.71 1.337 ...
##  $ gp              : num  0.935 0.116 0.991 0.187 0.849 ...
##  $ income.lt.30K   : logi  FALSE FALSE TRUE FALSE FALSE TRUE ...
##  $ age.range       : Factor w/ 3 levels "[0,25]","(25,65]",..: 2 3 2 2 2 2 3 2 3 2 ...
##  $ Income          : num  NA NA 4500 20000 12000 180000 120000 40000 NA 24000 ...

names(custdata)

##  [1] "state.of.res"     "custid"           "sex"              "is.employed"     
##  [5] "income"           "marital.stat"     "health.ins"       "housing.type"    
##  [9] "recent.move"      "num.vehicles"     "age"              "is.employed.fix1"
## [13] "age.normalized"   "Median.Income"    "income.norm"      "gp"              
## [17] "income.lt.30K"    "age.range"        "Income"

colSums(is.na(custdata))

##     state.of.res           custid              sex      is.employed 
##                0                0                0              328 
##           income     marital.stat       health.ins     housing.type 
##                0                0                0               56 
##      recent.move     num.vehicles              age is.employed.fix1 
##               56               56                0                0 
##   age.normalized    Median.Income      income.norm               gp 
##                0                0                0                0 
##    income.lt.30K        age.range           Income 
##                0                0              328

custdata[!complete.cases(custdata), ]

custdata$is.employed[is.na(custdata$is.employed)] <- FALSE

custdata$Income[is.na(custdata$Income)] <- mean(custdata$Income, na.rm = TRUE)

sum(is.na(custdata))

## [1] 168

colSums(is.na(custdata))

##     state.of.res           custid              sex      is.employed 
##                0                0                0                0 
##           income     marital.stat       health.ins     housing.type 
##                0                0                0               56 
##      recent.move     num.vehicles              age is.employed.fix1 
##               56               56                0                0 
##   age.normalized    Median.Income      income.norm               gp 
##                0                0                0                0 
##    income.lt.30K        age.range           Income 
##                0                0                0

missing_values <- colSums(is.na(custdata))
data.frame(Attribute = names(missing_values), Missing_Values = missing_values)

colSums(is.na(custdata))

##     state.of.res           custid              sex      is.employed 
##                0                0                0                0 
##           income     marital.stat       health.ins     housing.type 
##                0                0                0               56 
##      recent.move     num.vehicles              age is.employed.fix1 
##               56               56                0                0 
##   age.normalized    Median.Income      income.norm               gp 
##                0                0                0                0 
##    income.lt.30K        age.range           Income 
##                0                0                0

custdata_clean <- custdata[complete.cases(custdata), ]

nrow(custdata)

## [1] 1000

nrow(custdata_clean)

## [1] 944

colSums(is.na(custdata_clean))

##     state.of.res           custid              sex      is.employed 
##                0                0                0                0 
##           income     marital.stat       health.ins     housing.type 
##                0                0                0                0 
##      recent.move     num.vehicles              age is.employed.fix1 
##                0                0                0                0 
##   age.normalized    Median.Income      income.norm               gp 
##                0                0                0                0 
##    income.lt.30K        age.range           Income 
##                0                0                0

load("C:/Users/dipsy/Downloads/exampleData.rData")

sum(is.na(custdata$is.employed))

## [1] 328

custdata$is_employed_AddMissing <- custdata$is.employed

custdata$is_employed_AddMissing[is.na(custdata$is_employed_AddMissing)] <- "missing"

custdata$is_employed_AddMissing <- as.factor(custdata$is_employed_AddMissing)

str(custdata$is_employed_AddMissing)

##  Factor w/ 3 levels "FALSE","missing",..: 3 2 2 2 3 1 3 3 2 2 ...

table(custdata$is_employed_AddMissing)

## 
##   FALSE missing    TRUE 
##      73     328     599

sum(is.na(custdata$Income))

## [1] 328

custdata$Income_MissingfixedByZeroes <- custdata$Income

custdata$Income_MissingfixedByZeroes[is.na(custdata$Income_MissingfixedByZeroes)] <- 0

sum(is.na(custdata$Income_MissingfixedByZeroes))

## [1] 0

summary(custdata$Income_MissingfixedByZeroes)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0       0   25250   44486   60000  615000

sum(is.na(custdata$Income))

## [1] 328

mean_income <- mean(custdata$Income, na.rm = TRUE)

custdata$Income_fixedByMean <- custdata$Income

custdata$Income_fixedByMean[is.na(custdata$Income_fixedByMean)] <- mean_income

sum(is.na(custdata$Income_fixedByMean))

## [1] 0

custdata_merged <- merge(custdata, medianincome,
                         by.x = "state.of.res",
                         by.y = "State")

str(custdata_merged)

## 'data.frame':    1000 obs. of  23 variables:
##  $ state.of.res               : Factor w/ 50 levels "Alabama","Alaska",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ custid                     : int  1063014 1192089 16551 1079878 502705 674271 15917 467335 462569 1216026 ...
##  $ sex                        : Factor w/ 2 levels "F","M": 1 2 1 1 2 2 1 2 2 2 ...
##  $ is.employed                : logi  TRUE NA NA NA TRUE FALSE ...
##  $ income                     : int  82000 49000 7000 37200 70000 0 24000 42600 22000 9600 ...
##  $ marital.stat               : Factor w/ 4 levels "Divorced/Separated",..: 2 2 2 1 2 2 1 3 4 3 ...
##  $ health.ins                 : logi  TRUE TRUE TRUE TRUE FALSE TRUE ...
##  $ housing.type               : Factor w/ 4 levels "Homeowner free and clear",..: 4 1 2 2 4 4 1 4 1 4 ...
##  $ recent.move                : logi  FALSE FALSE FALSE FALSE FALSE TRUE ...
##  $ num.vehicles               : int  2 2 2 1 4 1 1 1 0 6 ...
##  $ age                        : num  43 77 46 62 37 54 70 33 89 50 ...
##  $ is.employed.fix1           : chr  "employed" "missing" "missing" "missing" ...
##  $ age.normalized             : num  -0.461 1.341 -0.302 0.546 -0.779 ...
##  $ Median.Income.x            : num  52371 52371 52371 52371 52371 ...
##  $ income.norm                : num  1.566 0.936 0.134 0.71 1.337 ...
##  $ gp                         : num  0.935 0.116 0.991 0.187 0.849 ...
##  $ income.lt.30K              : logi  FALSE FALSE TRUE FALSE FALSE TRUE ...
##  $ age.range                  : Factor w/ 3 levels "[0,25]","(25,65]",..: 2 3 2 2 2 2 3 2 3 2 ...
##  $ Income                     : num  NA NA 4500 20000 12000 180000 120000 40000 NA 24000 ...
##  $ is_employed_AddMissing     : Factor w/ 3 levels "FALSE","missing",..: 3 2 2 2 3 1 3 3 2 2 ...
##  $ Income_MissingfixedByZeroes: num  0 0 4500 20000 12000 180000 120000 40000 0 24000 ...
##  $ Income_fixedByMean         : num  66199 66199 4500 20000 12000 ...
##  $ Median.Income.y            : num  52371 52371 52371 52371 52371 ...

head(custdata_merged)

dim(custdata_merged)

## [1] 1000   23

summary(custdata_merged[, c("state.of.res", "Income", "Median.Income.y")])

##        state.of.res     Income       Median.Income.y
##  California  :114   Min.   :     0   Min.   :37427  
##  New York    : 94   1st Qu.: 25000   1st Qu.:44819  
##  Pennsylvania: 63   Median : 45000   Median :50118  
##  Ohio        : 59   Mean   : 66199   Mean   :50919  
##  Illinois    : 52   3rd Qu.: 82000   3rd Qu.:55534  
##  Texas       : 51   Max.   :615000   Max.   :68187  
##  (Other)     :567   NA's   :328

names(custdata_merged)

##  [1] "state.of.res"                "custid"                     
##  [3] "sex"                         "is.employed"                
##  [5] "income"                      "marital.stat"               
##  [7] "health.ins"                  "housing.type"               
##  [9] "recent.move"                 "num.vehicles"               
## [11] "age"                         "is.employed.fix1"           
## [13] "age.normalized"              "Median.Income.x"            
## [15] "income.norm"                 "gp"                         
## [17] "income.lt.30K"               "age.range"                  
## [19] "Income"                      "is_employed_AddMissing"     
## [21] "Income_MissingfixedByZeroes" "Income_fixedByMean"         
## [23] "Median.Income.y"

custdata_merged$income_normed <- custdata_merged$income / custdata_merged$Median.Income.y
summary(custdata_merged$income_normed)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -0.1956  0.2812  0.6712  1.0781  1.3508 11.7870

custdata <- custdata %>%
  mutate(
    Income_lt20K = income < 20000
  )


custdata <- custdata %>%
  mutate(
    Age_range = cut(
      age,
      breaks = c(0, 25, 65, Inf),
      include.lowest = TRUE,
      right = TRUE,
      labels = c("[0,25]", "(25,65]", "(65,Inf]")
    )
  )


table(custdata$Income_lt20K, useNA = "ifany")

## 
## FALSE  TRUE 
##   678   322

table(custdata$Age_range, useNA = "ifany")

## 
##   [0,25]  (25,65] (65,Inf] 
##       56      732      212

#C. Principal Component Analysis: (10 points)

custdata_num <- custdata %>%
  select(where(is.numeric))

custdata_num_complete <- na.omit(custdata_num)

pca_result <- prcomp(custdata_num_complete, center = TRUE, scale. = TRUE)

loadings <- pca_result$rotation
print(loadings)

##                                     PC1         PC2          PC3         PC4
## custid                       0.01142428  0.03302662 -0.037399481 -0.72826416
## income                      -0.03223412  0.60547968 -0.336305870  0.02510335
## num.vehicles                 0.07301064  0.08409691 -0.174797805 -0.27518099
## age                         -0.09019248  0.34943223  0.601107986 -0.03506363
## age.normalized              -0.09019248  0.34943223  0.601107986 -0.03506363
## Median.Income                0.03758558 -0.03719228  0.079758118 -0.13176205
## income.norm                 -0.03128580  0.60744578 -0.340945226  0.04675630
## gp                          -0.03987057  0.02937951 -0.005959855  0.60690070
## Income                       0.56957569  0.05712197  0.056695747  0.03131448
## Income_MissingfixedByZeroes  0.56957569  0.05712197  0.056695747  0.03131448
## Income_fixedByMean           0.56957569  0.05712197  0.056695747  0.03131448
##                                     PC5         PC6         PC7           PC8
## custid                      -0.20872179  0.29443959  0.58029047  0.0038512352
## income                       0.09321076 -0.11088206  0.07045078 -0.7019096223
## num.vehicles                 0.22302808  0.66746354 -0.62132525  0.0049139288
## age                         -0.01538148  0.05912184 -0.05889997 -0.0009884328
## age.normalized              -0.01538148  0.05912184 -0.05889997 -0.0009884328
## Median.Income                0.94389958 -0.12316213  0.24254164  0.0924104975
## income.norm                 -0.03178884 -0.10198053  0.03837326  0.7061996620
## gp                           0.05674514  0.65009478  0.45092859  0.0018337784
## Income                      -0.02801971 -0.01032755  0.02366876 -0.0026409539
## Income_MissingfixedByZeroes -0.02801971 -0.01032755  0.02366876 -0.0026409539
## Income_fixedByMean          -0.02801971 -0.01032755  0.02366876 -0.0026409539
##                                       PC9          PC10          PC11
## custid                       5.645814e-18  0.000000e+00  0.000000e+00
## income                       9.268621e-18  4.863870e-17 -4.421700e-17
## num.vehicles                -6.180119e-17 -1.312365e-17  1.532063e-17
## age                          7.071068e-01  2.113380e-17 -1.082852e-16
## age.normalized              -7.071068e-01 -5.129245e-17  3.251755e-17
## Median.Income                1.745875e-17 -5.807064e-18  2.387798e-17
## income.norm                 -7.496433e-17  1.142673e-17  2.405494e-17
## gp                           1.191739e-17 -3.375786e-18  4.078414e-18
## Income                       1.501805e-16 -8.164966e-01  9.392783e-17
## Income_MissingfixedByZeroes  3.915815e-17  4.082483e-01 -7.071068e-01
## Income_fixedByMean           3.915815e-17  4.082483e-01  7.071068e-01

scree_var <- pca_result$sdev^2
plot(
  scree_var,
  type = "b",
  pch = 19,
  xlab = "Principal Component",
  ylab = "Eigenvalue / Variance",
  main = "Scree Plot"
)

abline(h = 1, col = "red", lty = 2)

pca_summary <- summary(pca_result)
print(pca_summary)

## Importance of components:
##                           PC1    PC2    PC3     PC4     PC5     PC6     PC7
## Standard deviation     1.7421 1.4202 1.4126 1.03662 1.00222 0.99191 0.93699
## Proportion of Variance 0.2759 0.1834 0.1814 0.09769 0.09131 0.08944 0.07981
## Cumulative Proportion  0.2759 0.4592 0.6407 0.73834 0.82966 0.91910 0.99891
##                            PC8      PC9     PC10      PC11
## Standard deviation     0.10935 2.15e-16 3.01e-17 5.788e-33
## Proportion of Variance 0.00109 0.00e+00 0.00e+00 0.000e+00
## Cumulative Proportion  1.00000 1.00e+00 1.00e+00 1.000e+00

variance_table <- data.frame(
  Principal_Component = paste0("PC", 1:length(pca_result$sdev)),
  Standard_Deviation = pca_result$sdev,
  Proportion_of_Variance = (pca_result$sdev^2) / sum(pca_result$sdev^2),
  Cumulative_Proportion = cumsum((pca_result$sdev^2) / sum(pca_result$sdev^2))
)

print(variance_table)

##    Principal_Component Standard_Deviation Proportion_of_Variance
## 1                  PC1       1.742091e+00           2.758984e-01
## 2                  PC2       1.420184e+00           1.833566e-01
## 3                  PC3       1.412583e+00           1.813993e-01
## 4                  PC4       1.036617e+00           9.768862e-02
## 5                  PC5       1.002217e+00           9.131255e-02
## 6                  PC6       9.919127e-01           8.944462e-02
## 7                  PC7       9.369857e-01           7.981293e-02
## 8                  PC8       1.093513e-01           1.087063e-03
## 9                  PC9       2.149835e-16           4.201629e-33
## 10                PC10       3.009859e-17           8.235682e-35
## 11                PC11       5.787846e-33           3.045378e-66
##    Cumulative_Proportion
## 1              0.2758984
## 2              0.4592549
## 3              0.6406542
## 4              0.7383428
## 5              0.8296554
## 6              0.9191000
## 7              0.9989129
## 8              1.0000000
## 9              1.0000000
## 10             1.0000000
## 11             1.0000000

variance_table[, sapply(variance_table, is.numeric)] <-
  round(variance_table[, sapply(variance_table, is.numeric)], 4)

Lab 3 (Data Preprocessing)

Dipsy

2026-03-09