# Download the Census Bureau 2010-2015 state population table.
# col_names = FALSE makes readr treat every row as data and name the columns
# X1..X9; the first four columns are kept as text, the last five are parsed
# as numbers.
a <- read_csv(
  "https://www2.census.gov/programs-surveys/popest/tables/2010-2015/state/totals/nst-est2015-01.csv",
  col_names = FALSE,
  col_types = cols(
    X1 = col_character(),
    X2 = col_character(),
    X3 = col_character(),
    X4 = col_character(),
    X5 = col_number(),
    X6 = col_number(),
    X7 = col_number(),
    X8 = col_number(),
    X9 = col_number()
  )
)

# Keep rows 10 through 59 only, converted to a base data frame.
a1 <- data.frame(a[10:59, ])

# Replace the default X1..X9 names with descriptive column names.
colnames(a1) <- c(
  "state", "census", "estbase",
  "est2010", "est2011", "est2012", "est2013", "est2014", "est2015"
)
# Round-trip the subset through a CSV file so that every value column can be
# re-parsed with col_number() (the original author's workaround for "the
# numeric problem").
# NOTE(review): presumably this is about the text columns read as character
# upstream -- confirm.
# The file is written to and read from the same relative path, so the script
# does not depend on being run from one particular directory.
write.csv(a1, "c.csv", row.names = FALSE)
a2 <- read_csv(
  "c.csv",
  col_names = TRUE,
  col_types = cols(
    state = col_character(),
    census = col_number(),
    estbase = col_number(),
    est2010 = col_number(),
    est2011 = col_number(),
    est2012 = col_number(),
    est2013 = col_number(),
    est2014 = col_number(),
    est2015 = col_number()
  )
)

# Strip the "." characters from the state names; fixed = TRUE makes "." a
# literal character rather than a regex wildcard.
a2$state <- gsub(".", "", a2$state, fixed = TRUE)

# Save the cleaned table as clean.csv in the working directory, which is
# where the later read_csv("clean.csv", ...) calls look for it.
write.csv(a2, "clean.csv", row.names = FALSE)
# Load the areas table and keep only its 4th and 5th columns, which this
# script uses as the state name and the land area.
areascsvb1 <- read.csv("areas.csv")
areascsvb2 <- areascsvb1[, c(4, 5)]
names(areascsvb2) <- c("state", "Land.sq.miles")

# Sort the rows by state name; later steps pair rows by position.
areascsvb2 <- areascsvb2[order(areascsvb2[["state"]]), ]
# Re-read the cleaned population table from clean.csv.
cleancsvb1 <- read_csv(
  "clean.csv",
  col_names = TRUE,
  col_types = cols(
    state = col_character(),
    census = col_number(),
    estbase = col_number(),
    est2010 = col_number(),
    est2011 = col_integer(),
    est2012 = col_integer(),
    est2013 = col_integer(),
    est2014 = col_integer(),
    est2015 = col_integer()
  )
)

# Population density = 2015 population estimate / land area, stored as a
# one-column data frame.
# NOTE(review): the division pairs rows by position, so it assumes cleancsvb1
# and areascsvb2 hold the same states in the same order -- confirm.
joined1 <- data.frame(cleancsvb1$est2015 / areascsvb2$Land.sq.miles)
colnames(joined1) <- c("Population Density (Based on 2015 Population)")
# Load the red/blue state table and sort it by state name so that its rows
# line up (by position) with the density values in joined1.
blueredstatescsvb1 <- read.csv("blueredstates.csv")
blueredstatescsvb1 <- blueredstatescsvb1[order(blueredstatescsvb1$State), ]

# Attach the density column to the red/blue table.
joined2 <- cbind(blueredstatescsvb1, joined1)

# Mean density, mean household income and mean land area per value of
# Overall. Columns are referenced by bare name so that each mean is computed
# within its group; writing joined2$col inside summarise() returns the whole
# column and therefore the same overall mean for every group.
# Land area is attached as a temporary column (by position, like the density)
# so that it is grouped as well.
# NOTE(review): assumes areascsvb2 rows are in the same state order as
# joined2 -- confirm.
joined2 %>%
  mutate(Land.sq.miles = areascsvb2$Land.sq.miles) %>%
  group_by(Overall) %>%
  summarise(
    popden = mean(`Population Density (Based on 2015 Population)`),
    MedianHouseholdIncome = mean(MedianHouseholdIncome),
    Land.sq.miles = mean(Land.sq.miles)
  )
# NOTE: the output previously recorded here showed identical rows for D and R
# (popden 421, MedianHouseholdIncome 54110, Land.sq.miles 70726). Those were
# the ungrouped overall means; re-run this step to regenerate per-group
# values.
# Read clean.csv again for the plotting steps.
cleancsvc1 <- read_csv(
  "clean.csv",
  col_names = TRUE,
  col_types = cols(
    state = col_character(),
    census = col_number(),
    estbase = col_number(),
    est2010 = col_number(),
    est2011 = col_integer(),
    est2012 = col_integer(),
    est2013 = col_integer(),
    est2014 = col_integer(),
    est2015 = col_integer()
  )
)

# Drop the first three columns (state, census, estbase), leaving only the
# yearly estimates est2010..est2015.
cleancsvc2 <- cleancsvc1[, -(1:3)]

# One box per remaining column: population estimates for 2010 through 2015.
boxplot(cleancsvc2)
The box plot:
# Mean of each yearly estimate column (est2010..est2015) across all rows,
# labelled by year.
z <- vapply(
  paste0("est", 2010:2015),
  function(col) mean(cleancsvc1[[col]]),
  numeric(1)
)
names(z) <- as.character(2010:2015)

# Two-column matrix: the year next to its mean population.
z <- cbind(c(2010, 2011, 2012, 2013, 2014, 2015), z)
colnames(z) <- c("Years", "Population")

# Point plot of the yearly means over a dashed grid; panel.first draws the
# grid before the points.
plot(
  z,
  main = "Mean of populations across states over the years",
  ylim = c(6000000, 6500000),
  panel.first = grid(lty = 2),
  col = "red",
  pch = 19
)
The point plot: