#Load census dataset
load("D:\\fall_2022\\MATH5640_Comp.Stat/census.RData")

Part 1: Loading and cleaning

Problem1.

How many states are represented among the 74020 census tracts? How many counties?

old_state<-unique(census$State_name)
length(unique(census$State_name))
## [1] 52

How many counties?

length(unique(census$County_name))
## [1] 1955

Problem 2.

Columns 8 through 31 of the census data frame represent numeric variables, but columns 8 and 9 are not stored as such. These two are measured in US dollars: median household income (Med_HHD_Inc_ACS_09_13) and median house value (Med_House_value_ACS_09_13). What are the classes of these columns? Check how many missing values are there for these two variabels.

#Data type of Med_HHD_Inc_ACS_09_13
class(census$Med_HHD_Inc_ACS_09_13)
## [1] "factor"
#Data type of Med_House_value_ACS_09_13
class(census$Med_House_value_ACS_09_13)
## [1] "factor"

Check how many missing values are there for these two variabels.

#Check the N/A value of the variable Med_HHD_Inc_ACS_09_13
which(is.na(census$Med_HHD_Inc_ACS_09_13))
## integer(0)
#Count the N/A value
sum(is.na(census$Med_HHD_Inc_ACS_09_13))
## [1] 0
#Check empty value of the variable Med_HHD_Inc_ACS_09_13
head(which(census$Med_HHD_Inc_ACS_09_13==""))
## [1]  44 107 108 109 806 869
#Count the empty value
sum(census$Med_HHD_Inc_ACS_09_13=="")
## [1] 1020

We see that there are no N/A values but 1020 empty values in Med_HHD_Inc_ACS_09_13 variable.

#Check the missing value of the variable Med_House_value_ACS_09_13
which(is.na(census$Med_House_value_ACS_09_13))
## integer(0)
#Count the missing value
sum(is.na(census$Med_House_value_ACS_09_13))
## [1] 0
#Check empty value of the variable Med_House_value_ACS_09_13
head(which(census$Med_House_value_ACS_09_13==""),5)
## [1]  44 107 108 109 229
#Count the empty value
sum(census$Med_House_value_ACS_09_13=="")
## [1] 1804

We have no N/A values but 1804 empty values in Med_House_value_ACS_09_13.

Prolem 3.

Convert these two columns into numbers (in whole US dollars). (Hint: it will be helpful to first convert into strings, then remove any non-numeric characters, then convert into numbers.) For example, $63,030 should be converted into the integer 63030. Check your answer by printing out the summary() of these two new columns. Make sure that empty entries (““) are properly converted to NA. Check the number of missing values in these two variables again and make sure your answer matches with the number obtained in problem 2.

#Convert non-numeric values of Med_HHD_Inc_ACS_09_13 variable into numerical values
#Function to convert non-numeric to numeric
convert_numeric<-function(a){
  x<-gsub("\\$"," ",a)
  x<-as.numeric(gsub(",","",x))
  return(x)
}
#Convert values of Med_HHD_Inc_ACS_09_13 variable
census$Med_HHD_Inc_ACS_09_13<- sapply(census$Med_HHD_Inc_ACS_09_13,convert_numeric)
#Check if the empty values now become N\A values.
head(which(is.na(census$Med_HHD_Inc_ACS_09_13)),5)
## [1]  44 107 108 109 806
#Convert values of Med_House_value_ACS_09_13 variable
census$Med_House_value_ACS_09_13<- sapply(census$Med_House_value_ACS_09_13,convert_numeric)
#Check if the empty values now become N\A values.
head(which(is.na(census$Med_House_value_ACS_09_13)),5)
## [1]  44 107 108 109 229

We obtain the results in the Problem 2, we see that the position of empty values of Med_HHD_Inc_ACS_09_13 now matches with the position of N/A values after converting non-numeric values to numeric values. Similarly to the cases of the variable Med_House_value_ACS_09_13, when all empty values at first now are converted into N/A values.

Problem 4

Several entries are missing in this data set, including the ones you discovered in the previous question. Compute the number of missing entries in each row, and save the vector as num.na.row. Then, obtain the indices of rows containing any missing values and save them in a vector named contains.na. What is the average number of missing values among the rows that contain at least one missing entry?

#Count number of missing entries in each row
num.na.row<-apply(census, 1, function(x) sum(is.na(x)))
num.na.row<-data.frame(num.na.row)
print(head(num.na.row))
##   num.na.row
## 1          0
## 2          0
## 3          0
## 4          0
## 5          0
## 6          0
#Index of rows containing any missing values
contains.na.<-which(num.na.row>0)
print(head(contains.na.,5))
## [1] 36 37 38 39 44

Problem 5

Are there any states with no missing values? If so, print out the names of all such states. Please do NOT print out replicated State names!

Yes, there is on state, which is West Virginia with no missing values.

unique(census[contains.na.,"State_name"])
##  [1] "Alabama"              "Alaska"               "Arizona"             
##  [4] "Arkansas"             "California"           "Colorado"            
##  [7] "Connecticut"          "Delaware"             "District of Columbia"
## [10] "Florida"              "Georgia"              "Hawaii"              
## [13] "Idaho"                "Illinois"             "Indiana"             
## [16] "Iowa"                 "Kansas"               "Kentucky"            
## [19] "Louisiana"            "Maine"                "Maryland"            
## [22] "Massachusetts"        "Michigan"             "Minnesota"           
## [25] "Mississippi"          "Missouri"             "Montana"             
## [28] "Nebraska"             "Nevada"               "New Hampshire"       
## [31] "New Jersey"           "New Mexico"           "New York"            
## [34] "North Carolina"       "North Dakota"         "Ohio"                
## [37] "Oklahoma"             "Oregon"               "Pennsylvania"        
## [40] "Rhode Island"         "South Carolina"       "South Dakota"        
## [43] "Tennessee"            "Texas"                "Utah"                
## [46] "Vermont"              "Virginia"             "Washington"          
## [49] "Wisconsin"            "Wyoming"              "Puerto Rico"
setdiff(unique(census$State_name),unique(census[contains.na.,"State_name"]))
## [1] "West Virginia"

Problem 6

Redefine the census data frame by removing rows that have missing values, as per the contains.na vector computed in Part 1 Question 4. Check that the new census data frame has now 70877 rows. How many states and counties are represented in this new data frame? What states (if any) have been thrown out compared to the original data frame?

#Delete rows that contain missing values.
census<-census[-contains.na.,]
print(dim(census))
## [1] 70877    31

After deleting rows contain misisng values, we have 70877 rows left in Cencus data.

#Number of states and counties are  represented in this new data frame
length(unique(census$State_name))
## [1] 51
length(unique(census$County_name))
## [1] 1861
#States in the new dataframe
new_state<-unique(census$State_name)
setdiff(old_state,new_state)
## [1] "Puerto Rico"

We see that there is one state “Puerto Rico is eliminated in the new dataframe.

Part 2: Exploratory statistics

Problem 1.

There are several variables which count the percentage of respondents in various age categories (they all start with pct_Pop_, col 12-16). Use these to determine, for each census tract, the percentages of the population that are less than 5 years old, and append these values in a new column called pct_Pop_0_4_ACS_09_13 to the census data frame. Then, use this new column, along with the existing Tot_Population_ACS_09_13 column, to determine the number of 0-4 year olds in each state. Which state has the highest number, and what is its value?

#Create a new column pct_Pop_0_4_ACS_09_13

census$pct_Pop_0_4_ACS_09_13<- 100-rowSums(census[,12:16])
head(census$pct_Pop_0_4_ACS_09_13,5)
## [1] 4.480088 4.331210 4.023553 3.361345 6.460234
census$total_Pop_0_4<-(census$pct_Pop_0_4_ACS_09_13/100)*census$Tot_Population_ACS_09_13
head(census$total_Pop_0_4)
## [1]  81 102 123 148 701 229
#Sum of Total Population from 0-4 each State
total_pop_each_state<-aggregate(census$total_Pop_0_4, by=list(census$State_name), FUN=sum)
total_pop_each_state
##                 Group.1       x
## 1               Alabama  299668
## 2                Alaska   45727
## 3               Arizona  427270
## 4              Arkansas  194959
## 5            California 2492629
## 6              Colorado  335309
## 7           Connecticut  196593
## 8              Delaware   55249
## 9  District of Columbia   35873
## 10              Florida 1066023
## 11              Georgia  670322
## 12               Hawaii   80005
## 13                Idaho  116394
## 14             Illinois  819400
## 15              Indiana  426394
## 16                 Iowa  197515
## 17               Kansas  200238
## 18             Kentucky  274805
## 19            Louisiana  308670
## 20                Maine   66721
## 21             Maryland  360895
## 22        Massachusetts  361730
## 23             Michigan  583582
## 24            Minnesota  350227
## 25          Mississippi  205023
## 26             Missouri  381636
## 27              Montana   55168
## 28             Nebraska  128974
## 29               Nevada  180451
## 30        New Hampshire   67599
## 31           New Jersey  534708
## 32           New Mexico  129706
## 33             New York 1119471
## 34       North Carolina  615250
## 35         North Dakota   42914
## 36                 Ohio  703858
## 37             Oklahoma  261688
## 38               Oregon  233154
## 39         Pennsylvania  721816
## 40         Rhode Island   55716
## 41       South Carolina  296552
## 42         South Dakota   54267
## 43            Tennessee  401796
## 44                Texas 1891368
## 45                 Utah  255851
## 46              Vermont   30977
## 47             Virginia  502254
## 48           Washington  430341
## 49        West Virginia  103578
## 50            Wisconsin  340178
## 51              Wyoming   37193
#The state has the highest number of children from 0 to 4 years old
total_pop_each_state[which.max(total_pop_each_state$x),]
##      Group.1       x
## 5 California 2492629

We have California is the state that has the highest children under 5 years old.

Problem 2.

Using your answer from the last question, determine the percentage of 0-4 year olds in each state. Which state has the highest percentage, and what is its value?

total_pop<-aggregate(census$Tot_Population_ACS_09_13, by=list(census$State_name), FUN=sum)
total_pop_each_state$population<-total_pop$x
total_pop_each_state$pct_population_0_4<-total_pop_each_state$x/total_pop_each_state$population
total_pop_each_state
##                 Group.1       x population pct_population_0_4
## 1               Alabama  299668    4770947         0.06281101
## 2                Alaska   45727     646314         0.07075044
## 3               Arizona  427270    6257267         0.06828381
## 4              Arkansas  194959    2928198         0.06657986
## 5            California 2492629   37003067         0.06736277
## 6              Colorado  335309    5049234         0.06640789
## 7           Connecticut  196593    3550823         0.05536547
## 8              Delaware   55249     904744         0.06106589
## 9  District of Columbia   35873     609222         0.05888330
## 10              Florida 1066023   18860945         0.05652013
## 11              Georgia  670322    9717995         0.06897740
## 12               Hawaii   80005    1308727         0.06113192
## 13                Idaho  116394    1568566         0.07420408
## 14             Illinois  819400   12800233         0.06401446
## 15              Indiana  426394    6461435         0.06599060
## 16                 Iowa  197515    3050415         0.06475021
## 17               Kansas  200238    2848260         0.07030187
## 18             Kentucky  274805    4324962         0.06353929
## 19            Louisiana  308670    4536776         0.06803730
## 20                Maine   66721    1321296         0.05049663
## 21             Maryland  360895    5763400         0.06261842
## 22        Massachusetts  361730    6491196         0.05572625
## 23             Michigan  583582    9806329         0.05951075
## 24            Minnesota  350227    5329758         0.06571161
## 25          Mississippi  205023    2961064         0.06923964
## 26             Missouri  381636    5973203         0.06389135
## 27              Montana   55168     936711         0.05889543
## 28             Nebraska  128974    1819710         0.07087613
## 29               Nevada  180451    2704718         0.06671712
## 30        New Hampshire   67599    1304204         0.05183162
## 31           New Jersey  534708    8780115         0.06089989
## 32           New Mexico  129706    1934604         0.06704525
## 33             New York 1119471   18716220         0.05981288
## 34       North Carolina  615250    9555763         0.06438523
## 35         North Dakota   42914     666448         0.06439212
## 36                 Ohio  703858   11474718         0.06133990
## 37             Oklahoma  261688    3745888         0.06986007
## 38               Oregon  233154    3857064         0.06044857
## 39         Pennsylvania  721816   12645569         0.05708055
## 40         Rhode Island   55716    1047838         0.05317234
## 41       South Carolina  296552    4639701         0.06391619
## 42         South Dakota   54267     782924         0.06931324
## 43            Tennessee  401796    6363531         0.06314042
## 44                Texas 1891368   25120728         0.07529113
## 45                 Utah  255851    2765089         0.09252903
## 46              Vermont   30977     619459         0.05000654
## 47             Virginia  502254    7984582         0.06290298
## 48           Washington  430341    6714125         0.06409487
## 49        West Virginia  103578    1851142         0.05595357
## 50            Wisconsin  340178    5450260         0.06241500
## 51              Wyoming   37193     540354         0.06883080
total_pop_each_state[which.max(total_pop_each_state$pct_population_0_4),]
##    Group.1      x population pct_population_0_4
## 45    Utah 255851    2765089         0.09252903

Since california has the highest population under 5 years old, but Utah is the state has the highest percentage of population under 5 years old.

Problem 3

Calculate the correlation between each of the numeric variables, columns 8 through 31 of the census data frame. Which two variables have the highest positive correlation? Which two variables have the lowest negative correlation?

#Matrix contains correlation coefficient 
cor<-round(cor(census[,8:31]),digits = 2)

#Finding minimum correlation coefficient
min = which(cor == min(cor), arr.ind = TRUE)  
print(paste("Minimum correlation coefficient: ", cor[min]))
## [1] "Minimum correlation coefficient:  -0.79"
## [2] "Minimum correlation coefficient:  -0.79"
print(min)
##                              row col
## pct_Single_Unit_ACS_09_13     24  23
## pct_Renter_Occp_HU_ACS_09_13  23  24
#Finding maximum correlation coefficient
max<-which(cor==max(cor[which(cor!=1,arr.ind = TRUE)]),arr.ind=TRUE)
print(paste("Maximum correlation coefficient: ", cor[max]))
## [1] "Maximum correlation coefficient:  0.74"
## [2] "Maximum correlation coefficient:  0.74"
print(max)
##                       row col
## pct_College_ACS_09_13  14   1
## Med_HHD_Inc_ACS_09_13   1  14

We have pct_College_ACS_09_13 and Med_HHD_Inc_ACS_09_13 has the highest positive coefficient, while pct_Single_Unit_ACS_09_13 and pct_Renter_Occp_HU_ACS_09_13 has the lowest negative correlation

Problem 4

Plot a histogram of Med_House_value_ACS_09_13, and label the axes appropriately. See that small bump at the value 1000001? This is a due to a common practice in public data sets of “top-coding”, i.e., censoring unusually large values to protect the anonymity of survey respondents.

hist(census$Med_House_value_ACS_09_13,breaks=10,main="Median House value",xlab='Median value')

Problem 5

It is possible that the tracts that have been top-coded differ significantly from the other tracts, in other ways than the median house value. The following code computes a t-test between two groups of census tracts, on any given variable. The two groups being compared are: all tracts with median house value equal to the max (1000001) and all tracts with median house value less than the max (<1000001). It then returns a p-value for the test. Note that a lower p-value means a more significant difference between top-coded and non-top-coded tracts, according to the variable in question

#t-test function
my.test = function(var) {
  group = census$Med_House_value_ACS_09_13 == 1000001
  p.val = t.test(var[group], var[!group])$p.value
  return(p.val)
}
#Finding p_value for each variable
p_value<-apply(census[,10:31],2,my.test)
# 2 smallest p-values
head(sort(p_value),2)
##     pct_College_ACS_09_13 pct_Not_HS_Grad_ACS_09_13 
##             1.511841e-321             1.569123e-256

Part III

Problem 1

Write a function plot.sample() that takes in five arguments: x, y, nsample, xlab, and ylab. The first two arguments are variables to be plotted. The third is the number of points to be randomly sampled, with a default of 500. (Hence, if x and y are vectors of length, say, 5000, then a random 500 of the total 5000 x-y pairs will be plotted, by default.) The last two are the x and y labels, both with defaults of “” (the empty string)

plot.sample<-function(x,y,nsample, xlab, ylab){
  #Check the length of two input x and y
  if(length(x)==length(y)){
    c<-data.frame(x,y)
    
    #Check if the input sample size exceed total number of data points
    if(nsample>nrow(c)){
      stop("Sample  exceed the total number of data points ")
    }
    else c<-data.frame(x,y)
    set.seed(300)
    c<-c[sample(seq_len(nrow(c)), nsample), ]
    plot(c,xlab = xlab,ylab=ylab)
  }
  #Check if the length of two input x and y is different
  if(length(x)!=length(y)){
    stop("Not the same length")
  }
  
}
plot.sample(x=census$Med_HHD_Inc_ACS_09_13,y=census$Med_House_value_ACS_09_13,nsample = 500, xlab="Median HHD income", ylab="Median house value")

Problem 2

write a function add.lineartrend.info(), which is designed to be called after plot.sample(). This takes as input x, y for variables that already appear on the x- and y-axes, and does two things:

• adds the best fit line (produced lm()) function in red, and with twice the default width (using the lwd=2 option);

• adds a title to the plot with the numeric correlation of the two variables, using 3 significant digits.

add.lineartrend.info<-function(x,y){
  c<-cbind(x,y)
  set.seed(300)
  c<-c[sample(seq_len(nrow(c)), 500), ]
  c<-data.frame(c)
  reg<-lm(y~x,data=c)
  corr<-cor(c$x,c$y)
  abline(reg, col = "darkgreen",lwd=2)
  title(main="")
  text(min(c$x), max(c$y) ,bquote(italic('r=')~.(round(corr, 3))), adj=0, cex=0.8,col="red")
}
#Plot scatter plot
plot.sample(x=census$Med_HHD_Inc_ACS_09_13,y=census$Med_House_value_ACS_09_13,nsample = 500, xlab="Median HHD income", ylab="Median house value")
#Plot regression line
add.lineartrend.info(x=census$Med_HHD_Inc_ACS_09_13,y=census$Med_House_value_ACS_09_13)

Problem 3

Write a function plot.all() which takes as input dataset and nsample, a data frame, and the number of points to be sampled (with default 500). This function will mimick the behavior of plot() when applied to data frames. In other words, if dataset has p columns, then plot.all() should produce a p by p retangular grid of plots, where the plot in entry i, j of the grid uses column i for the x-axis data, and column j for the y-axis data.

plot.all = function(dataset, nsample=500) {
  p = ncol(dataset)
  orig.par = par()
  # Set the margins, and plotting grid
  par(mar=c(2,2,2,2))
  par(mfrow=c(p,p))
  # TODO: your plotting code goes here
  set.seed(300)
  dataset<-dataset[sample(seq_len(nrow(dataset)), nsample), ]
  for (i in 1:ncol(dataset)){
    for(j in 1:ncol(dataset)){
      colname<-colnames(mydat)
      
      plot.sample(x=dataset[,i],y=dataset[,j],nsample = 500, xlab="", ylab="")
      text(max(dataset[,i]),y=max(dataset[,j]),colname[i],col="blue",cex=0.6,font=4)
      text(max(dataset[,i]),y=min(dataset[,j]),colname[j],col="red",cex=0.6,font=4)
      add.lineartrend.info(x=dataset[,i],y=dataset[,j])
     
      
    }
  }
  
  # Reset the margins, and plotting grid
  par(mar=orig.par$mar)
  par(mfrow=orig.par$mfrow)
}



mydat = census[,c("Med_HHD_Inc_ACS_09_13", "Med_House_value_ACS_09_13","pct_College_ACS_09_13","Mail_Return_Rate_CEN_2010")]

plot.all(mydat)

This relationship does make sense to me.