Part A

# Download the Census state population estimates, tidy them, and save the
# result as clean.csv in the working directory.

# Columns are read without a header (col_names = FALSE), so readr names them
# X1..X9; the first four are kept as text and the rest parsed as numbers.
a <- read_csv(
  "https://www2.census.gov/programs-surveys/popest/tables/2010-2015/state/totals/nst-est2015-01.csv",
  col_types = cols(
    X1 = col_character(),
    X2 = col_character(),
    X3 = col_character(),
    X4 = col_character(),
    X5 = col_number(),
    X6 = col_number(),
    X7 = col_number(),
    X8 = col_number(),
    X9 = col_number()
  ),
  col_names = FALSE
)

# Keep rows 10 to 59 only.
# NOTE(review): presumably these are the per-state rows -- confirm against the
# current layout of the Census file.
a1 <- data.frame(a[10:59, ])

colnames(a1) <- c(
  "state", "census", "estbase", "est2010", "est2011", "est2012", "est2013",
  "est2014", "est2015"
)

# Round-trip through a CSV so the columns that were read as text are re-parsed
# as numbers. The file is written and read with the same relative path, so the
# script no longer depends on the working directory being one specific home
# directory.
write.csv(a1, "c.csv", row.names = FALSE)
a2 <- read_csv(
  "c.csv",
  col_types = cols(
    state = col_character(),
    census = col_number(),
    estbase = col_number(),
    est2010 = col_number(),
    est2011 = col_number(),
    est2012 = col_number(),
    est2013 = col_number(),
    est2014 = col_number(),
    est2015 = col_number()
  ),
  col_names = TRUE
)

# Strip only the leading period(s) in front of each state name; periods that
# might occur inside a name are left untouched.
a2$state <- sub("^\\.+", "", a2$state)

# Save the cleaned table; later steps read it back as "clean.csv".
write.csv(a2, "clean.csv", row.names = FALSE)

# Population density per state and a summary grouped by the `Overall` column
# of blueredstates.csv.

# Columns 4 and 5 of areas.csv are used as the state name and the land area.
areascsvb1 <- read.csv("areas.csv")
areascsvb2 <- areascsvb1[, 4:5]
colnames(areascsvb2) <- c("state", "Land.sq.miles")
# Sort by state name: the tables below are combined by row position, not by
# key, so every table must be in the same state order.
areascsvb2 <- areascsvb2[order(areascsvb2$state), ]

cleancsvb1 <- read_csv(
  "clean.csv",
  col_types = cols(
    state = col_character(),
    census = col_number(),
    estbase = col_number(),
    est2010 = col_number(),
    est2011 = col_integer(),
    est2012 = col_integer(),
    est2013 = col_integer(),
    est2014 = col_integer(),
    est2015 = col_integer()
  ),
  col_names = TRUE
)

# Density = 2015 population estimate / land area, matched row by row.
# NOTE(review): this assumes clean.csv is already in the same state order as
# the sorted areas table -- confirm, or replace with a join on the state name.
joined1 <- data.frame(cleancsvb1$est2015 / areascsvb2$Land.sq.miles)
colnames(joined1) <- c("Population Density (Based on 2015 Population)")

blueredstatescsvb1 <- read.csv("blueredstates.csv")
# Same ordering as above so the positional cbind() lines up.
blueredstatescsvb1 <- blueredstatescsvb1[order(blueredstatescsvb1$State), ]
joined2 <- cbind(blueredstatescsvb1, joined1)

# Per-group means. Columns are referenced by bare name so that summarise()
# sees only the rows of the current group; referring to them as `joined2$...`
# would average the whole table and return the same value for every group.
joined2 %>%
  mutate(Land.sq.miles = areascsvb2$Land.sq.miles) %>%
  group_by(Overall) %>%
  summarise(
    popden = mean(`Population Density (Based on 2015 Population)`),
    MedianHouseholdIncome = mean(MedianHouseholdIncome),
    Land.sq.miles = mean(Land.sq.miles)
  )

# NOTE(review): the output recorded below was produced by an earlier version
# that averaged whole columns instead of per-group values (hence the identical
# D and R rows); re-run the summary above to refresh it.
## # A tibble: 2 x 4
##   Overall popden MedianHouseholdIncome Land.sq.miles
##   <fct>    <dbl>                 <dbl>         <dbl>
## 1 D          421                 54110         70726
## 2 R          421                 54110         70726

# Re-read the cleaned table and draw one box per yearly estimate column.
cleancsvc1 <- read_csv(
  "clean.csv",
  col_types = cols(
    state = col_character(),
    census = col_number(),
    estbase = col_number(),
    est2010 = col_number(),
    est2011 = col_integer(),
    est2012 = col_integer(),
    est2013 = col_integer(),
    est2014 = col_integer(),
    est2015 = col_integer()
  ),
  col_names = TRUE
)

# Drop the first three columns (state, census, estbase), leaving only the
# est2010..est2015 columns.
cleancsvc2 <- cleancsvc1[, -(1:3)]

# Box plot of the remaining columns, one box per year.
boxplot(cleancsvc2)

The box plot:

# Build a two-column matrix (year, mean population across all rows of
# cleancsvc1) and plot it as points.
z <- cbind(
  c(2010, 2011, 2012, 2013, 2014, 2015),
  vapply(
    paste0("est", 2010:2015),
    function(year_col) mean(cleancsvc1[[year_col]]),
    numeric(1),
    USE.NAMES = FALSE
  )
)
# Rows are labelled by year; the column names become the axis labels in plot().
dimnames(z) <- list(
  c("2010", "2011", "2012", "2013", "2014", "2015"),
  c("Years", "Population")
)

# Point plot with a dashed background grid; the y-range is fixed to
# 6.0-6.5 million.
plot(
  z,
  main = "Mean of populations across states over the years",
  ylim = c(6000000, 6500000),
  panel.first = grid(lty = 2),
  col = "red",
  pch = 19
)

The point plot:

a03a

Jiasheng Li

February 5, 2018

Part A