In the Midterm, we will use the 2015 U.S. Census County data to answer the following questions.
library(tidyverse)
## -- Attaching packages ---------------------------------- tidyverse 1.2.1 --
## <U+221A> ggplot2 2.2.1 <U+221A> purrr 0.2.4
## <U+221A> tibble 1.4.2 <U+221A> dplyr 0.7.4
## <U+221A> tidyr 0.7.2 <U+221A> stringr 1.2.0
## <U+221A> readr 1.1.1 <U+221A> forcats 0.2.0
## -- Conflicts ------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
data <- read.csv("/Users/zhengdongnanzi/Desktop/Columbia Spring 2018/GR 5058/Midterm/county_census_data2015.csv")
data <- as_tibble(data)
data
## # A tibble: 3,220 x 37
## CensusId State County TotalPop Men Women Hispanic White Black Native
## <int> <fct> <fct> <int> <int> <int> <dbl> <dbl> <dbl> <dbl>
## 1 1001 Alaba~ Autau~ 55221 26745 28476 2.60 75.8 18.5 0.400
## 2 1003 Alaba~ Baldw~ 195121 95314 99807 4.50 83.1 9.50 0.600
## 3 1005 Alaba~ Barbo~ 26932 14497 12435 4.60 46.2 46.7 0.200
## 4 1007 Alaba~ Bibb 22604 12073 10531 2.20 74.5 21.4 0.400
## 5 1009 Alaba~ Blount 57710 28512 29198 8.60 87.9 1.50 0.300
## 6 1011 Alaba~ Bullo~ 10678 5660 5018 4.40 22.2 70.7 1.20
## 7 1013 Alaba~ Butler 20354 9502 10852 1.20 53.3 43.8 0.100
## 8 1015 Alaba~ Calho~ 116648 56274 60374 3.50 73.0 20.3 0.200
## 9 1017 Alaba~ Chamb~ 34079 16258 17821 0.400 57.3 40.3 0.200
## 10 1019 Alaba~ Chero~ 26008 12975 13033 1.50 91.7 4.80 0.600
## # ... with 3,210 more rows, and 27 more variables: Asian <dbl>,
## # Pacific <dbl>, Citizen <int>, Income <dbl>, IncomeErr <dbl>,
## # IncomePerCap <int>, IncomePerCapErr <int>, Poverty <dbl>,
## # ChildPoverty <dbl>, Professional <dbl>, Service <dbl>, Office <dbl>,
## # Construction <dbl>, Production <dbl>, Drive <dbl>, Carpool <dbl>,
## # Transit <dbl>, Walk <dbl>, OtherTransp <dbl>, WorkAtHome <dbl>,
## # MeanCommute <dbl>, Employed <int>, PrivateWork <dbl>,
## # PublicWork <dbl>, SelfEmployed <dbl>, FamilyWork <dbl>,
## # Unemployment <dbl>
data1 <- select(data,County,TotalPop)
arrange(data1,desc(TotalPop))
## # A tibble: 3,220 x 2
## County TotalPop
## <fct> <int>
## 1 Los Angeles 10038388
## 2 Cook 5236393
## 3 Harris 4356362
## 4 Maricopa 4018143
## 5 San Diego 3223096
## 6 Orange 3116069
## 7 Miami-Dade 2639042
## 8 Kings 2595259
## 9 Dallas 2485003
## 10 Queens 2301139
## # ... with 3,210 more rows
Los Angeles, Cook, Harris, Maricopa and San Diego, these five counties, had the largest values for the variable signifying total county population.
data2 <- select(data,County,IncomePerCap)
data2 <- arrange(data2,desc(IncomePerCap))
tail(data2)
## # A tibble: 6 x 2
## County IncomePerCap
## <fct> <int>
## 1 "Comer\u00edo" 7076
## 2 Lajas 6974
## 3 "Las Mar\u00edas" 6792
## 4 Orocovis 6792
## 5 Ciales 6774
## 6 Maricao 5878
Maricao, Ciales, Orocovis, Las Mar0edas and Lajas, these five counties, had the smallest values for the variable signifying income per capita.
data3 <- select(data,County,Unemployment)
data3 <- arrange(data3,desc(Unemployment))
tail(data3)
## # A tibble: 6 x 2
## County Unemployment
## <fct> <dbl>
## 1 Sterling 0.300
## 2 Garfield 0.200
## 3 Kalawao 0
## 4 Thomas 0
## 5 Slope 0
## 6 Kenedy 0
Kenedy, Slope, Thomas, Kalawao and Garfield, these five counties, had the lowest unemployment rate.
summary(data$TotalPop)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 85 11218 26035 99409 66430 10038388
summary(data$IncomePerCap)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 5878 20238 23460 23982 27053 65600
summary(data$Unemployment)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 5.500 7.600 8.094 9.900 36.500
data5 <- data
# (1) Create a new variable named Region
data5["Region"] <- NA
# (2) Using the column signifying state names to create a new categorical variable
# that signifies regions: NORTHEAST, MIDWEST, SOUTH and WEST
data5 <- mutate(data5,
Region = ifelse(State %in%
c("Connecticut","Maine","Massachusetts",
"New Hampshire","Rhode Island",
"Vermont","New Jersey","New York",
"Pennsylvania"),"NORTHEAST",
ifelse(State %in% c("Illinois","Indiana",
"Michigan","Ohio","Wisconsin",
"Iowa","Kansas","Minnesota",
"Missouri","Nebraska",
"North Dakota",
"South Dakota"), "MIDWEST",
ifelse(State %in% c("Delaware",
"District of Columbia",
"Florida",
"Georgia","Maryland",
"North Carolina",
"South Carolina",
"Virginia",
"West Virginia",
"Alabama",
"Kentucky",
"Mississippi",
"Tennessee", "Arkansas",
"Louisiana","Oklahoma",
"Texas"), "SOUTH",
ifelse(State %in% c("Arizona",
"Colorado",
"Idaho",
"Montana",
"Nevada",
"New Mexico",
"Utah",
"Wyoming",
"Alaska",
"California",
"Hawaii",
"Oregon",
"Washington"),
"WEST","NA")))))
data5
## # A tibble: 3,220 x 38
## CensusId State County TotalPop Men Women Hispanic White Black Native
## <int> <fct> <fct> <int> <int> <int> <dbl> <dbl> <dbl> <dbl>
## 1 1001 Alaba~ Autau~ 55221 26745 28476 2.60 75.8 18.5 0.400
## 2 1003 Alaba~ Baldw~ 195121 95314 99807 4.50 83.1 9.50 0.600
## 3 1005 Alaba~ Barbo~ 26932 14497 12435 4.60 46.2 46.7 0.200
## 4 1007 Alaba~ Bibb 22604 12073 10531 2.20 74.5 21.4 0.400
## 5 1009 Alaba~ Blount 57710 28512 29198 8.60 87.9 1.50 0.300
## 6 1011 Alaba~ Bullo~ 10678 5660 5018 4.40 22.2 70.7 1.20
## 7 1013 Alaba~ Butler 20354 9502 10852 1.20 53.3 43.8 0.100
## 8 1015 Alaba~ Calho~ 116648 56274 60374 3.50 73.0 20.3 0.200
## 9 1017 Alaba~ Chamb~ 34079 16258 17821 0.400 57.3 40.3 0.200
## 10 1019 Alaba~ Chero~ 26008 12975 13033 1.50 91.7 4.80 0.600
## # ... with 3,210 more rows, and 28 more variables: Asian <dbl>,
## # Pacific <dbl>, Citizen <int>, Income <dbl>, IncomeErr <dbl>,
## # IncomePerCap <int>, IncomePerCapErr <int>, Poverty <dbl>,
## # ChildPoverty <dbl>, Professional <dbl>, Service <dbl>, Office <dbl>,
## # Construction <dbl>, Production <dbl>, Drive <dbl>, Carpool <dbl>,
## # Transit <dbl>, Walk <dbl>, OtherTransp <dbl>, WorkAtHome <dbl>,
## # MeanCommute <dbl>, Employed <int>, PrivateWork <dbl>,
## # PublicWork <dbl>, SelfEmployed <dbl>, FamilyWork <dbl>,
## # Unemployment <dbl>, Region <chr>
data5new <- select(data5,County,Region)
head(data5new)
## # A tibble: 6 x 2
## County Region
## <fct> <chr>
## 1 Autauga SOUTH
## 2 Baldwin SOUTH
## 3 Barbour SOUTH
## 4 Bibb SOUTH
## 5 Blount SOUTH
## 6 Bullock SOUTH
# (3) Delete any states or territories that do not fall under this regional classification scheme
# (e.g.- Puerto Rico).
data5_1 <- data5[!grepl("NA", data5$Region),]
data5_1
## # A tibble: 3,142 x 38
## CensusId State County TotalPop Men Women Hispanic White Black Native
## <int> <fct> <fct> <int> <int> <int> <dbl> <dbl> <dbl> <dbl>
## 1 1001 Alaba~ Autau~ 55221 26745 28476 2.60 75.8 18.5 0.400
## 2 1003 Alaba~ Baldw~ 195121 95314 99807 4.50 83.1 9.50 0.600
## 3 1005 Alaba~ Barbo~ 26932 14497 12435 4.60 46.2 46.7 0.200
## 4 1007 Alaba~ Bibb 22604 12073 10531 2.20 74.5 21.4 0.400
## 5 1009 Alaba~ Blount 57710 28512 29198 8.60 87.9 1.50 0.300
## 6 1011 Alaba~ Bullo~ 10678 5660 5018 4.40 22.2 70.7 1.20
## 7 1013 Alaba~ Butler 20354 9502 10852 1.20 53.3 43.8 0.100
## 8 1015 Alaba~ Calho~ 116648 56274 60374 3.50 73.0 20.3 0.200
## 9 1017 Alaba~ Chamb~ 34079 16258 17821 0.400 57.3 40.3 0.200
## 10 1019 Alaba~ Chero~ 26008 12975 13033 1.50 91.7 4.80 0.600
## # ... with 3,132 more rows, and 28 more variables: Asian <dbl>,
## # Pacific <dbl>, Citizen <int>, Income <dbl>, IncomeErr <dbl>,
## # IncomePerCap <int>, IncomePerCapErr <int>, Poverty <dbl>,
## # ChildPoverty <dbl>, Professional <dbl>, Service <dbl>, Office <dbl>,
## # Construction <dbl>, Production <dbl>, Drive <dbl>, Carpool <dbl>,
## # Transit <dbl>, Walk <dbl>, OtherTransp <dbl>, WorkAtHome <dbl>,
## # MeanCommute <dbl>, Employed <int>, PrivateWork <dbl>,
## # PublicWork <dbl>, SelfEmployed <dbl>, FamilyWork <dbl>,
## # Unemployment <dbl>, Region <chr>
# We can see that the total rows decrease from 3220 to 3142 due to the deletion
# (4) Create a chart that visualizes the relationship
# between the region variable and income per capita.
boxplot(IncomePerCap~Region,data=data5_1,
main="", xlab="Region", ylab="IncomePerCap")
Yes, there is a relationship between county regions and county income per capita.
The key statistical measures for the county income per capita are the highest in NORTHEAST region, second highest in MIDWEST region, a bit lower in WEST region and the lowest in SOUTH region shown by the plot (median, 1st and 3rd quartiles). However, there are overlaps for county income per capita in the boxplots amongest all the four regions.
chooseCRANmirror(graphics=FALSE, ind=1)
knitr::opts_chunk$set(echo = TRUE)
# (1)
subdata <- select(data,-c(CensusId,State,County))
subdata
## # A tibble: 3,220 x 34
## TotalPop Men Women Hispanic White Black Native Asian Pacific Citizen
## <int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int>
## 1 55221 26745 28476 2.60 75.8 18.5 0.400 1.00 0 40725
## 2 195121 95314 99807 4.50 83.1 9.50 0.600 0.700 0 147695
## 3 26932 14497 12435 4.60 46.2 46.7 0.200 0.400 0 20714
## 4 22604 12073 10531 2.20 74.5 21.4 0.400 0.100 0 17495
## 5 57710 28512 29198 8.60 87.9 1.50 0.300 0.100 0 42345
## 6 10678 5660 5018 4.40 22.2 70.7 1.20 0.200 0 8057
## 7 20354 9502 10852 1.20 53.3 43.8 0.100 0.400 0 15581
## 8 116648 56274 60374 3.50 73.0 20.3 0.200 0.900 0 88612
## 9 34079 16258 17821 0.400 57.3 40.3 0.200 0.800 0 26462
## 10 26008 12975 13033 1.50 91.7 4.80 0.600 0.300 0 20600
## # ... with 3,210 more rows, and 24 more variables: Income <dbl>,
## # IncomeErr <dbl>, IncomePerCap <int>, IncomePerCapErr <int>,
## # Poverty <dbl>, ChildPoverty <dbl>, Professional <dbl>, Service <dbl>,
## # Office <dbl>, Construction <dbl>, Production <dbl>, Drive <dbl>,
## # Carpool <dbl>, Transit <dbl>, Walk <dbl>, OtherTransp <dbl>,
## # WorkAtHome <dbl>, MeanCommute <dbl>, Employed <int>,
## # PrivateWork <dbl>, PublicWork <dbl>, SelfEmployed <dbl>,
## # FamilyWork <dbl>, Unemployment <dbl>
# (2)
table(is.na(subdata))
##
## FALSE TRUE
## 109477 3
# We find some variables have missing values NA,
# Therefore, we need to delete them first in order to create the correlation plot
data6 <- na.omit(subdata)
cor.matrix <- round(cor(data6),2)
head(cor.matrix)
## TotalPop Men Women Hispanic White Black Native Asian Pacific
## TotalPop 1.00 1.00 1.00 0.11 -0.19 0.08 -0.05 0.45 0.06
## Men 1.00 1.00 1.00 0.11 -0.19 0.07 -0.05 0.45 0.06
## Women 1.00 1.00 1.00 0.11 -0.19 0.08 -0.05 0.45 0.06
## Hispanic 0.11 0.11 0.11 1.00 -0.73 -0.14 -0.06 0.05 0.00
## White -0.19 -0.19 -0.19 -0.73 1.00 -0.47 -0.23 -0.20 -0.09
## Black 0.08 0.07 0.08 -0.14 -0.47 1.00 -0.10 0.02 -0.05
## Citizen Income IncomeErr IncomePerCap IncomePerCapErr Poverty
## TotalPop 1.00 0.24 -0.26 0.24 -0.24 -0.06
## Men 1.00 0.24 -0.26 0.24 -0.24 -0.06
## Women 1.00 0.23 -0.26 0.24 -0.24 -0.06
## Hispanic 0.10 -0.22 -0.03 -0.29 -0.03 0.48
## White -0.18 0.27 0.08 0.35 0.10 -0.62
## Black 0.08 -0.22 -0.08 -0.21 -0.08 0.31
## ChildPoverty Professional Service Office Construction Production
## TotalPop -0.06 0.25 0.00 0.19 -0.26 -0.18
## Men -0.06 0.25 0.00 0.19 -0.26 -0.18
## Women -0.06 0.25 0.00 0.19 -0.27 -0.18
## Hispanic 0.40 -0.10 0.22 0.10 0.13 -0.19
## White -0.58 0.08 -0.32 -0.12 0.02 0.16
## Black 0.37 -0.13 0.15 0.10 -0.16 0.11
## Drive Carpool Transit Walk OtherTransp WorkAtHome MeanCommute
## TotalPop -0.10 -0.08 0.40 -0.05 0.04 -0.03 0.15
## Men -0.10 -0.08 0.40 -0.05 0.04 -0.03 0.15
## Women -0.11 -0.08 0.41 -0.05 0.04 -0.03 0.15
## Hispanic -0.02 0.07 0.09 0.00 0.04 -0.12 0.03
## White 0.08 -0.10 -0.18 -0.04 -0.17 0.22 -0.08
## Black 0.15 0.03 0.08 -0.17 0.00 -0.28 0.19
## Employed PrivateWork PublicWork SelfEmployed FamilyWork
## TotalPop 1.00 0.20 -0.14 -0.15 -0.09
## Men 1.00 0.20 -0.14 -0.15 -0.08
## Women 1.00 0.20 -0.14 -0.15 -0.09
## Hispanic 0.10 -0.17 0.21 0.00 -0.05
## White -0.18 0.23 -0.40 0.20 0.11
## Black 0.07 0.06 0.12 -0.31 -0.14
## Unemployment
## TotalPop 0.03
## Men 0.03
## Women 0.03
## Hispanic 0.32
## White -0.54
## Black 0.35
install.packages(("corrplot"))
##
## The downloaded binary packages are in
## /var/folders/zp/3g72mswd05d6nsdjf4g88dzc0000gn/T//RtmpMTEGCn/downloaded_packages
library(corrplot)
## corrplot 0.84 loaded
corrplot(cor.matrix, type="upper",
order = "hclust", tl.col="black",
tl.srt=60, tl.cex = 0.5)
# subdata has missing values
table(is.na(subdata))
##
## FALSE TRUE
## 109477 3
# Delete it
data7 <- na.omit(subdata)
table(is.na(data7))
##
## FALSE
## 109412
# data6 does not have missing values since we removed it in Q6
table(is.na(data6))
##
## FALSE
## 109412
install.packages("factoextra")
##
## The downloaded binary packages are in
## /var/folders/zp/3g72mswd05d6nsdjf4g88dzc0000gn/T//RtmpMTEGCn/downloaded_packages
library(factoextra)
## Welcome! Related Books: `Practical Guide To Cluster Analysis in R` at https://goo.gl/13EFCZ
# (1)
data8 <- select(data,IncomePerCap,Drive,Transit,Employed)
# First, examining the means of each variable in the dataset might be useful.
colMeans(select(data,IncomePerCap,Drive,Transit,Employed), na.rm = TRUE)
## IncomePerCap Drive Transit Employed
## 2.398177e+04 7.918193e+01 9.718323e-01 4.559352e+04
# (2)
data8 <- na.omit(data8)
data8 <- scale(data8)
set.seed(123)
km.out=kmeans(data8,4,nstart=20)
fviz_cluster(km.out, data = data8)
set.seed(123)
k4 <- kmeans(data8, 4, nstart = 20)
data8_1 <- select(data,IncomePerCap,Drive,Transit,Employed) %>%
mutate(Cluster = k4$cluster) %>%
group_by(Cluster) %>%
summarise_all("mean")
data8_1 <- as.data.frame(data8_1)
data8_1
## Cluster IncomePerCap Drive Transit Employed
## 1 1 34909.08 39.12500 38.1750000 1102224.75
## 2 2 20445.48 81.77387 0.4543906 18661.81
## 3 3 35549.96 73.55403 6.6943548 439732.60
## 4 4 28196.80 76.11826 0.8193361 36507.93
# Compared with the means of each variable in the dataset
colMeans(select(data,IncomePerCap,Drive,Transit,Employed), na.rm = TRUE)
## IncomePerCap Drive Transit Employed
## 2.398177e+04 7.918193e+01 9.718323e-01 4.559352e+04
We know that
Therefore, to sum it up, comparing with the means of each variable in the dataset, the categorized county 3 (Cluster 3) above is most likely to purchase automobiles since it has the highest income per capita (incomePerCap), relative high % commuting alone in an automobile (Drive), relative low % commuting on public transportation (Transit) and relative high % employed (Employed).
# (1) Hierarchical clustering using Complete Linkage
hc1 <- hclust(dist(data8, method = "euclidean"), method = "complete" )
# Plot the obtained dendrogram
plot(hc1, cex = 0.6, hang = -1) #notice how row.names become labels
# cut off = 4
sub_grp <- cutree(hc1, k = 4)
fviz_cluster(list(data = data8, cluster = sub_grp))
# Number of members in each cluster
table(sub_grp)
## sub_grp
## 1 2 3 4
## 3210 5 1 4
library(dplyr)
data9 <- select(data,IncomePerCap,Drive,Transit,Employed) %>%
mutate(cluster = sub_grp) %>% group_by(cluster) %>% summarise_all("mean")
data9 <- as.data.frame(data9)
data9
## cluster IncomePerCap Drive Transit Employed
## 1 1 23956.98 79.26536 0.8910903 40223.51
## 2 2 30795.00 74.44000 5.8400000 1867493.00
## 3 3 28337.00 73.00000 6.8000000 4635465.00
## 4 4 34274.75 19.70000 58.2250000 930184.50
# Compared with the means of each variable in the dataset
colMeans(select(data,IncomePerCap,Drive,Transit,Employed), na.rm = TRUE)
## IncomePerCap Drive Transit Employed
## 2.398177e+04 7.918193e+01 9.718323e-01 4.559352e+04
The same as Question 8), we know that
Therefore, to sum it up, in the hierarchical clustering analysis, if we compared with the means of each variable in the dataset, the categorized county 3 (Cluster 3) above is most likely to purchase automobiles since it has relative high income per capita (incomePerCap), relative high % commuting alone in an automobile (Drive), relative low % commuting on public transportation (Transit) and the highest high % employed (Employed).
However, we also observe that the Hierarchical Cluster Analysis results in unbalanced groups due to the very small number of counties in one group and very large number in another. Therefore, the conclusions are not practically useful.
No, there is an authoritative standard that defines how many categories we should choose when using unsupervised learning for cluster analysis.
However, to decide which is the optimal number of clusters, we can use the Elbow method. The basic idea is to define clusters such that the total intra-cluster variation [or total within-cluster sum of square (WSS)] is minimized (reference: [link])(http://www.sthda.com/english/articles/29-cluster-validation-essentials/96-determining-the-optimal-number-of-clusters-3-must-know-methods/#elbow-method). The total WSS measures the compactness of clustering and we want it to be as small as possible. The Elbow method looks at the total WSS as a function of the number of clusters: One should choose a number of clusters so that adding another cluster does not improve much better the total WSS.
We can see the plot below that when the number of clusters is 5, adding the sixth cluster does not improve much better the total WSS. Therefore, we know that 5 clusters might be the optimal number of categories to explore our data.
set.seed(123)
fviz_nbclust(data8, kmeans, method = "wss")
data11 <- select(data,-c(CensusId,State,County))
install.packages(c("FactoMineR", "factoextra"))
##
## The downloaded binary packages are in
## /var/folders/zp/3g72mswd05d6nsdjf4g88dzc0000gn/T//RtmpMTEGCn/downloaded_packages
library("FactoMineR")
library("factoextra")
data11 <- na.omit(data11)
# (1)
pr.out=prcomp(data11, scale = TRUE)
# PC1
# the first variabe we create to summarize all off the continuous variables (Z1)
# It is a linear combination of all the continuous variables
# which captures the maximum variance in the data set.
# Z1 = 0.33*TotalPop + 0.33*Men + 0.33*Women - 0.024*Hispanic +...
pr.out$rotation[,1]
## TotalPop Men Women Hispanic
## 0.330752043 0.330441379 0.330974467 -0.024061490
## White Black Native Asian
## 0.008380334 -0.004109366 -0.056078567 0.235093614
## Pacific Citizen Income IncomeErr
## 0.042251109 0.335688957 0.231137759 -0.131003861
## IncomePerCap IncomePerCapErr Poverty ChildPoverty
## 0.230389361 -0.117602480 -0.150084875 -0.148328160
## Professional Service Office Construction
## 0.189022960 -0.068456387 0.114303605 -0.176490198
## Production Drive Carpool Transit
## -0.100541726 -0.033588008 -0.079840778 0.192695285
## Walk OtherTransp WorkAtHome MeanCommute
## -0.030609972 0.004027988 0.001394034 0.067344701
## Employed PrivateWork PublicWork SelfEmployed
## 0.335095433 0.156597320 -0.131764211 -0.088240164
## FamilyWork Unemployment
## -0.059934566 -0.076662240
# PC2
# the second variabe we create to summarize all off the continuous variables (Z2)
# It is also a linear combination of all the continuous variables
# which captures the remaining variance in the data set
# Z2 = -0.1*TotalPop - 0.1*Men - 0.10*Women - 0.166*Hispanic +...
pr.out$rotation[,2]
## TotalPop Men Women Hispanic
## -0.103508562 -0.103161314 -0.103819003 -0.165523391
## White Black Native Asian
## 0.250174329 -0.194185184 0.018138624 0.008230439
## Pacific Citizen Income IncomeErr
## 0.018814310 -0.103432118 0.232719102 0.196541337
## IncomePerCap IncomePerCapErr Poverty ChildPoverty
## 0.266159041 0.206344319 -0.308301279 -0.314047475
## Professional Service Office Construction
## 0.191871296 -0.164870180 -0.135659070 0.091728531
## Production Drive Carpool Transit
## -0.100559306 -0.169815267 -0.034153528 -0.016589424
## Walk OtherTransp WorkAtHome MeanCommute
## 0.152716758 0.021786111 0.265480143 -0.143659205
## Employed PrivateWork PublicWork SelfEmployed
## -0.095631656 -0.091018781 -0.039465071 0.231062875
## FamilyWork Unemployment
## 0.136413465 -0.302880292
# (2) 37%
pca <- PCA(data11, graph = FALSE)
fviz_pca_var(pca, col.var = "black")
pr.out$sdev
## [1] 2.611578e+00 2.411891e+00 2.121372e+00 1.461375e+00 1.317950e+00
## [6] 1.207224e+00 1.118453e+00 1.053711e+00 1.042308e+00 1.014071e+00
## [11] 9.541545e-01 9.164039e-01 8.710676e-01 8.380396e-01 7.568946e-01
## [16] 7.213067e-01 7.044556e-01 6.475992e-01 6.361367e-01 5.543831e-01
## [21] 5.387386e-01 5.354747e-01 5.075143e-01 4.211613e-01 2.413975e-01
## [26] 2.112988e-01 6.699417e-02 5.192168e-02 4.131990e-02 1.709239e-02
## [31] 6.804513e-03 5.928508e-03 5.182098e-03 5.139928e-15
pr.var=pr.out$sdev^2
pve=pr.var/sum(pr.var)
pve
## [1] 2.005982e-01 1.710946e-01 1.323594e-01 6.281225e-02 5.108803e-02
## [6] 4.286440e-02 3.679227e-02 3.265611e-02 3.195313e-02 3.024532e-02
## [11] 2.677679e-02 2.469989e-02 2.231643e-02 2.065619e-02 1.684969e-02
## [16] 1.530245e-02 1.459581e-02 1.233484e-02 1.190205e-02 9.039429e-03
## [21] 8.536448e-03 8.433327e-03 7.575612e-03 5.216965e-03 1.713905e-03
## [26] 1.313153e-03 1.320064e-04 7.929002e-05 5.021571e-05 8.592644e-06
## [31] 1.361806e-06 1.033741e-06 7.898278e-07 7.770253e-31
plot(pve, xlab="Principal Component",
ylab="Proportion of Variance Explained", ylim=c(0,1), type="b")
plot(cumsum(pve), xlab="Principal Component",
ylab="Cumulative Proportion of Variance Explained", ylim=c(0,1),type="b")
# (3)
# Suppose IncomePerCap is the output variable,
# We can run regression Z1Z2 on IncomePerCa
Z1Z2 <- pr.out$x[,1:2]
lm(data11$IncomePerCap~Z1Z2)
##
## Call:
## lm(formula = data11$IncomePerCap ~ Z1Z2)
##
## Coefficients:
## (Intercept) Z1Z2PC1 Z1Z2PC2
## 23974 1427 1648
For Principal Components Analysis, the two variables we create to summarize all of the continuous variables in the dataset are PC1 (:Z1) and PC2 (:Z2).
These two variables explain 37% of the variance in the data.
In supervised learning, PCA can be used to perform regression models using the principal component scores as features (reference: link). For example, suppose we want to predict the income per capita (IncomePerCap). We can use the first two principal components (Z1 and Z2) as input variables and IncomePerCap as output variable for supervised learning, which could lead to a more robust model and predict the IncomePerCap better.