#Load census dataset
load("D:\\fall_2022\\MATH5640_Comp.Stat/census.RData")
How many states are represented among the 74020 census tracts? How many counties?
#Save the original set of states for comparison after cleaning
old_state<-unique(census$State_name)
length(unique(census$State_name))
## [1] 52
How many counties?
length(unique(census$County_name))
## [1] 1955
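Note that this counts distinct county names; the same county name (for example, Washington County) occurs in several states. If distinct state-county pairs are wanted instead, one possible check is (value not shown here):
#Count distinct (state, county) pairs rather than distinct county names
length(unique(paste(census$State_name, census$County_name, sep=" - ")))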
Columns 8 through 31 of the census data frame represent numeric variables, but columns 8 and 9 are not stored as such. These two are measured in US dollars: median household income (Med_HHD_Inc_ACS_09_13) and median house value (Med_House_value_ACS_09_13). What are the classes of these columns? Check how many missing values there are for these two variables.
#Data type of Med_HHD_Inc_ACS_09_13
class(census$Med_HHD_Inc_ACS_09_13)
## [1] "factor"
#Data type of Med_House_value_ACS_09_13
class(census$Med_House_value_ACS_09_13)
## [1] "factor"
Check how many missing values there are for these two variables.
#Check the N/A value of the variable Med_HHD_Inc_ACS_09_13
which(is.na(census$Med_HHD_Inc_ACS_09_13))
## integer(0)
#Count the N/A value
sum(is.na(census$Med_HHD_Inc_ACS_09_13))
## [1] 0
#Check empty value of the variable Med_HHD_Inc_ACS_09_13
head(which(census$Med_HHD_Inc_ACS_09_13==""))
## [1] 44 107 108 109 806 869
#Count the empty value
sum(census$Med_HHD_Inc_ACS_09_13=="")
## [1] 1020
We see that there are no NA values but 1020 empty ("") entries in the Med_HHD_Inc_ACS_09_13 variable.
#Check the missing value of the variable Med_House_value_ACS_09_13
which(is.na(census$Med_House_value_ACS_09_13))
## integer(0)
#Count the missing value
sum(is.na(census$Med_House_value_ACS_09_13))
## [1] 0
#Check empty value of the variable Med_House_value_ACS_09_13
head(which(census$Med_House_value_ACS_09_13==""),5)
## [1] 44 107 108 109 229
#Count the empty value
sum(census$Med_House_value_ACS_09_13=="")
## [1] 1804
Similarly, there are no NA values but 1804 empty ("") entries in Med_House_value_ACS_09_13.
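Both checks can also be done in one step; a quick cross-check of the blank counts in columns 8 and 9 (values not shown here):
#Count blank ("") entries in both dollar-valued columns at once
colSums(census[,8:9]=="")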
Convert these two columns into numbers (in whole US dollars). (Hint: it will be helpful to first convert into strings, then remove any non-numeric characters, then convert into numbers.) For example, $63,030 should be converted into the integer 63030. Check your answer by printing out the summary() of these two new columns. Make sure that empty entries ("") are properly converted to NA. Check the number of missing values in these two variables again and make sure your answer matches with the number obtained in problem 2.
#Convert the dollar-valued columns into numeric values
#Function to convert a currency string (e.g. "$63,030") into a number
convert_numeric<-function(a){
  x<-as.character(a)              #factor -> character
  x<-gsub("\\$","",x)             #remove the dollar sign
  x<-as.numeric(gsub(",","",x))   #remove commas, then convert; "" becomes NA
  return(x)
}
#Convert values of the Med_HHD_Inc_ACS_09_13 variable (gsub is vectorized, so sapply is not needed)
census$Med_HHD_Inc_ACS_09_13<-convert_numeric(census$Med_HHD_Inc_ACS_09_13)
#Check that the empty values have now become NA values
head(which(is.na(census$Med_HHD_Inc_ACS_09_13)),5)
## [1] 44 107 108 109 806
#Convert values of the Med_House_value_ACS_09_13 variable
census$Med_House_value_ACS_09_13<-convert_numeric(census$Med_House_value_ACS_09_13)
#Check that the empty values have now become NA values
head(which(is.na(census$Med_House_value_ACS_09_13)),5)
## [1] 44 107 108 109 229
This matches the results from Problem 2: the positions of the formerly empty entries of Med_HHD_Inc_ACS_09_13 now correspond to the positions of the NA values after the conversion. The same holds for Med_House_value_ACS_09_13, where all of the originally empty entries have been converted to NA.
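The question also asks to verify the conversion with summary() and to confirm that the number of NA values matches the blank counts from Problem 2 (1020 and 1804); the checks below are straightforward (output omitted here):
#Summaries of the two converted columns; the NA counts should match Problem 2
summary(census$Med_HHD_Inc_ACS_09_13)
summary(census$Med_House_value_ACS_09_13)
sum(is.na(census$Med_HHD_Inc_ACS_09_13))
sum(is.na(census$Med_House_value_ACS_09_13))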
Several entries are missing in this data set, including the ones you discovered in the previous question. Compute the number of missing entries in each row, and save the vector as num.na.row. Then, obtain the indices of rows containing any missing values and save them in a vector named contains.na. What is the average number of missing values among the rows that contain at least one missing entry?
#Count number of missing entries in each row
num.na.row<-apply(census, 1, function(x) sum(is.na(x)))
num.na.row<-data.frame(num.na.row)
print(head(num.na.row))
## num.na.row
## 1 0
## 2 0
## 3 0
## 4 0
## 5 0
## 6 0
#Index of rows containing any missing values
contains.na<-which(num.na.row>0)
print(head(contains.na,5))
## [1] 36 37 38 39 44
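The last part of the question asks for the average number of missing values among the rows that contain at least one missing entry; with num.na.row and contains.na defined as above, one way to compute it is (value not shown here):
#Average number of missing entries among rows with at least one missing value
mean(num.na.row$num.na.row[contains.na])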
Are there any states with no missing values? If so, print out the names of all such states. Please do NOT print out replicated State names!
Yes, there is one state, West Virginia, that has no missing values, as shown below.
unique(census[contains.na,"State_name"])
## [1] "Alabama" "Alaska" "Arizona"
## [4] "Arkansas" "California" "Colorado"
## [7] "Connecticut" "Delaware" "District of Columbia"
## [10] "Florida" "Georgia" "Hawaii"
## [13] "Idaho" "Illinois" "Indiana"
## [16] "Iowa" "Kansas" "Kentucky"
## [19] "Louisiana" "Maine" "Maryland"
## [22] "Massachusetts" "Michigan" "Minnesota"
## [25] "Mississippi" "Missouri" "Montana"
## [28] "Nebraska" "Nevada" "New Hampshire"
## [31] "New Jersey" "New Mexico" "New York"
## [34] "North Carolina" "North Dakota" "Ohio"
## [37] "Oklahoma" "Oregon" "Pennsylvania"
## [40] "Rhode Island" "South Carolina" "South Dakota"
## [43] "Tennessee" "Texas" "Utah"
## [46] "Vermont" "Virginia" "Washington"
## [49] "Wisconsin" "Wyoming" "Puerto Rico"
setdiff(unique(census$State_name),unique(census[contains.na,"State_name"]))
## [1] "West Virginia"
Redefine the census data frame by removing rows that have missing values, as per the contains.na vector computed in Part 1 Question 4. Check that the new census data frame has now 70877 rows. How many states and counties are represented in this new data frame? What states (if any) have been thrown out compared to the original data frame?
#Delete rows that contain missing values.
census<-census[-contains.na,]
print(dim(census))
## [1] 70877 31
After deleting the rows containing missing values, 70877 rows remain in the census data.
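As an additional check, the cleaned data frame should contain no missing entries at all (value not shown here, but it should be 0):
#The cleaned data frame should have no NA values left
sum(is.na(census))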
#Number of states and counties are represented in this new data frame
length(unique(census$State_name))
## [1] 51
length(unique(census$County_name))
## [1] 1861
#States in the new dataframe
new_state<-unique(census$State_name)
setdiff(old_state,new_state)
## [1] "Puerto Rico"
We see that one "state" in the data, Puerto Rico, has been eliminated from the new data frame.
There are several variables which count the percentage of respondents in various age categories (they all start with pct_Pop_, col 12-16). Use these to determine, for each census tract, the percentages of the population that are less than 5 years old, and append these values in a new column called pct_Pop_0_4_ACS_09_13 to the census data frame. Then, use this new column, along with the existing Tot_Population_ACS_09_13 column, to determine the number of 0-4 year olds in each state. Which state has the highest number, and what is its value?
#Create a new column pct_Pop_0_4_ACS_09_13: 100 minus the other age-group percentages (columns 12-16)
census$pct_Pop_0_4_ACS_09_13<- 100-rowSums(census[,12:16])
head(census$pct_Pop_0_4_ACS_09_13,5)
## [1] 4.480088 4.331210 4.023553 3.361345 6.460234
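A quick sanity check on the derived column, making sure the values look like valid percentages (output omitted here):
#Sanity check: the derived percentages should lie between 0 and 100
summary(census$pct_Pop_0_4_ACS_09_13)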
#Estimated number of 0-4 year olds in each tract
census$total_Pop_0_4<-(census$pct_Pop_0_4_ACS_09_13/100)*census$Tot_Population_ACS_09_13
head(census$total_Pop_0_4)
## [1] 81 102 123 148 701 229
#Sum of Total Population from 0-4 each State
total_pop_each_state<-aggregate(census$total_Pop_0_4, by=list(census$State_name), FUN=sum)
total_pop_each_state
## Group.1 x
## 1 Alabama 299668
## 2 Alaska 45727
## 3 Arizona 427270
## 4 Arkansas 194959
## 5 California 2492629
## 6 Colorado 335309
## 7 Connecticut 196593
## 8 Delaware 55249
## 9 District of Columbia 35873
## 10 Florida 1066023
## 11 Georgia 670322
## 12 Hawaii 80005
## 13 Idaho 116394
## 14 Illinois 819400
## 15 Indiana 426394
## 16 Iowa 197515
## 17 Kansas 200238
## 18 Kentucky 274805
## 19 Louisiana 308670
## 20 Maine 66721
## 21 Maryland 360895
## 22 Massachusetts 361730
## 23 Michigan 583582
## 24 Minnesota 350227
## 25 Mississippi 205023
## 26 Missouri 381636
## 27 Montana 55168
## 28 Nebraska 128974
## 29 Nevada 180451
## 30 New Hampshire 67599
## 31 New Jersey 534708
## 32 New Mexico 129706
## 33 New York 1119471
## 34 North Carolina 615250
## 35 North Dakota 42914
## 36 Ohio 703858
## 37 Oklahoma 261688
## 38 Oregon 233154
## 39 Pennsylvania 721816
## 40 Rhode Island 55716
## 41 South Carolina 296552
## 42 South Dakota 54267
## 43 Tennessee 401796
## 44 Texas 1891368
## 45 Utah 255851
## 46 Vermont 30977
## 47 Virginia 502254
## 48 Washington 430341
## 49 West Virginia 103578
## 50 Wisconsin 340178
## 51 Wyoming 37193
#The state has the highest number of children from 0 to 4 years old
total_pop_each_state[which.max(total_pop_each_state$x),]
## Group.1 x
## 5 California 2492629
California is the state with the highest number of children under 5 years old (2492629).
Using your answer from the last question, determine the percentage of 0-4 year olds in each state. Which state has the highest percentage, and what is its value?
total_pop<-aggregate(census$Tot_Population_ACS_09_13, by=list(census$State_name), FUN=sum)
total_pop_each_state$population<-total_pop$x
total_pop_each_state$pct_population_0_4<-total_pop_each_state$x/total_pop_each_state$population
total_pop_each_state
## Group.1 x population pct_population_0_4
## 1 Alabama 299668 4770947 0.06281101
## 2 Alaska 45727 646314 0.07075044
## 3 Arizona 427270 6257267 0.06828381
## 4 Arkansas 194959 2928198 0.06657986
## 5 California 2492629 37003067 0.06736277
## 6 Colorado 335309 5049234 0.06640789
## 7 Connecticut 196593 3550823 0.05536547
## 8 Delaware 55249 904744 0.06106589
## 9 District of Columbia 35873 609222 0.05888330
## 10 Florida 1066023 18860945 0.05652013
## 11 Georgia 670322 9717995 0.06897740
## 12 Hawaii 80005 1308727 0.06113192
## 13 Idaho 116394 1568566 0.07420408
## 14 Illinois 819400 12800233 0.06401446
## 15 Indiana 426394 6461435 0.06599060
## 16 Iowa 197515 3050415 0.06475021
## 17 Kansas 200238 2848260 0.07030187
## 18 Kentucky 274805 4324962 0.06353929
## 19 Louisiana 308670 4536776 0.06803730
## 20 Maine 66721 1321296 0.05049663
## 21 Maryland 360895 5763400 0.06261842
## 22 Massachusetts 361730 6491196 0.05572625
## 23 Michigan 583582 9806329 0.05951075
## 24 Minnesota 350227 5329758 0.06571161
## 25 Mississippi 205023 2961064 0.06923964
## 26 Missouri 381636 5973203 0.06389135
## 27 Montana 55168 936711 0.05889543
## 28 Nebraska 128974 1819710 0.07087613
## 29 Nevada 180451 2704718 0.06671712
## 30 New Hampshire 67599 1304204 0.05183162
## 31 New Jersey 534708 8780115 0.06089989
## 32 New Mexico 129706 1934604 0.06704525
## 33 New York 1119471 18716220 0.05981288
## 34 North Carolina 615250 9555763 0.06438523
## 35 North Dakota 42914 666448 0.06439212
## 36 Ohio 703858 11474718 0.06133990
## 37 Oklahoma 261688 3745888 0.06986007
## 38 Oregon 233154 3857064 0.06044857
## 39 Pennsylvania 721816 12645569 0.05708055
## 40 Rhode Island 55716 1047838 0.05317234
## 41 South Carolina 296552 4639701 0.06391619
## 42 South Dakota 54267 782924 0.06931324
## 43 Tennessee 401796 6363531 0.06314042
## 44 Texas 1891368 25120728 0.07529113
## 45 Utah 255851 2765089 0.09252903
## 46 Vermont 30977 619459 0.05000654
## 47 Virginia 502254 7984582 0.06290298
## 48 Washington 430341 6714125 0.06409487
## 49 West Virginia 103578 1851142 0.05595357
## 50 Wisconsin 340178 5450260 0.06241500
## 51 Wyoming 37193 540354 0.06883080
total_pop_each_state[which.max(total_pop_each_state$pct_population_0_4),]
## Group.1 x population pct_population_0_4
## 45 Utah 255851 2765089 0.09252903
While California has the highest number of children under 5 years old, Utah is the state with the highest percentage of its population under 5 years old.
Calculate the correlation between each of the numeric variables, columns 8 through 31 of the census data frame. Which two variables have the highest positive correlation? Which two variables have the lowest negative correlation?
#Matrix contains correlation coefficient
cor<-round(cor(census[,8:31]),digits = 2)
#Finding minimum correlation coefficient
min = which(cor == min(cor), arr.ind = TRUE)
print(paste("Minimum correlation coefficient: ", cor[min]))
## [1] "Minimum correlation coefficient: -0.79"
## [2] "Minimum correlation coefficient: -0.79"
print(min)
## row col
## pct_Single_Unit_ACS_09_13 24 23
## pct_Renter_Occp_HU_ACS_09_13 23 24
#Finding maximum correlation coefficient
max<-which(cor==max(cor[which(cor!=1,arr.ind = TRUE)]),arr.ind=TRUE)
print(paste("Maximum correlation coefficient: ", cor[max]))
## [1] "Maximum correlation coefficient: 0.74"
## [2] "Maximum correlation coefficient: 0.74"
print(max)
## row col
## pct_College_ACS_09_13 14 1
## Med_HHD_Inc_ACS_09_13 1 14
pct_College_ACS_09_13 and Med_HHD_Inc_ACS_09_13 have the highest positive correlation (0.74), while pct_Single_Unit_ACS_09_13 and pct_Renter_Occp_HU_ACS_09_13 have the lowest negative correlation (-0.79).
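An equivalent way to locate the same pairs is to blank out the diagonal of the correlation matrix first; a small sketch using the cor matrix computed above (cor.nd is just an illustrative name):
#Copy the correlation matrix and ignore the trivial self-correlations
cor.nd<-cor
diag(cor.nd)<-NA
#Most positive and most negative pairs
which(cor.nd==max(cor.nd,na.rm=TRUE),arr.ind=TRUE)
which(cor.nd==min(cor.nd,na.rm=TRUE),arr.ind=TRUE)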
Plot a histogram of Med_House_value_ACS_09_13, and label the axes appropriately. See that small bump at the value 1000001? This is due to a common practice in public data sets of "top-coding", i.e., censoring unusually large values to protect the anonymity of survey respondents.
#Use finer bins than the default so the top-coding spike near 1000001 is visible
hist(census$Med_House_value_ACS_09_13,breaks=50,main="Median House Value",xlab="Median house value (USD)")
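To see how many tracts are affected by the top-coding, a quick count (value not shown here):
#Number of tracts top-coded at the maximum reported value
sum(census$Med_House_value_ACS_09_13==1000001)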
It is possible that the tracts that have been top-coded differ significantly from the other tracts, in ways other than the median house value. The following code computes a t-test between two groups of census tracts, on any given variable. The two groups being compared are: all tracts with median house value equal to the max (1000001) and all tracts with median house value less than the max (<1000001). It then returns the p-value for the test. Note that a lower p-value means a more significant difference between top-coded and non-top-coded tracts, according to the variable in question.
#t-test function
my.test = function(var) {
  #TRUE for top-coded tracts, FALSE otherwise
  group = census$Med_House_value_ACS_09_13 == 1000001
  #Two-sample t-test comparing the variable across the two groups
  p.val = t.test(var[group], var[!group])$p.value
  return(p.val)
}
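As a quick illustration, the function can be called on a single variable before applying it to all of them (p-value not shown here):
#Example: t-test p-value for the college-educated percentage
my.test(census$pct_College_ACS_09_13)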
#Finding p_value for each variable
p_value<-apply(census[,10:31],2,my.test)
# 2 smallest p-values
head(sort(p_value),2)
## pct_College_ACS_09_13 pct_Not_HS_Grad_ACS_09_13
## 1.511841e-321 1.569123e-256
Write a function plot.sample() that takes in five arguments: x, y, nsample, xlab, and ylab. The first two arguments are variables to be plotted. The third is the number of points to be randomly sampled, with a default of 500. (Hence, if x and y are vectors of length, say, 5000, then a random 500 of the total 5000 x-y pairs will be plotted, by default.) The last two are the x and y labels, both with defaults of "" (the empty string).
plot.sample<-function(x, y, nsample = 500, xlab = "", ylab = ""){
  #Stop if the two input vectors have different lengths
  if(length(x) != length(y)){
    stop("x and y must have the same length")
  }
  dat<-data.frame(x, y)
  #Stop if the requested sample size exceeds the total number of data points
  if(nsample > nrow(dat)){
    stop("Sample size exceeds the total number of data points")
  }
  #Fix the seed so the same points are drawn each time
  set.seed(300)
  dat<-dat[sample(seq_len(nrow(dat)), nsample), ]
  plot(dat, xlab = xlab, ylab = ylab)
}
plot.sample(x=census$Med_HHD_Inc_ACS_09_13,y=census$Med_House_value_ACS_09_13,nsample = 500, xlab="Median HHD income", ylab="Median house value")
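Since nsample, xlab, and ylab have defaults (500 and empty strings), the same scatter plot can also be produced by passing only the two variables:
#Same plot relying on the default arguments
plot.sample(census$Med_HHD_Inc_ACS_09_13, census$Med_House_value_ACS_09_13)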
Write a function add.lineartrend.info(), which is designed to be called after plot.sample(). This takes as input x, y for variables that already appear on the x- and y-axes, and does two things:
• adds the best-fit line (produced by the lm() function) in red, with twice the default width (using the lwd=2 option);
• adds a title to the plot with the numeric correlation of the two variables, using 3 significant digits.
add.lineartrend.info<-function(x, y){
  dat<-data.frame(x, y)
  #Use the same seed as plot.sample() so the fit is based on the same 500 sampled points
  set.seed(300)
  dat<-dat[sample(seq_len(nrow(dat)), 500), ]
  #Best-fit line from lm(), drawn in red with twice the default width
  reg<-lm(y ~ x, data = dat)
  abline(reg, col = "red", lwd = 2)
  #Title showing the correlation of the two variables, to 3 significant digits
  corr<-cor(dat$x, dat$y)
  title(main = paste("Correlation:", signif(corr, 3)))
}
#Plot scatter plot
plot.sample(x=census$Med_HHD_Inc_ACS_09_13,y=census$Med_House_value_ACS_09_13,nsample = 500, xlab="Median HHD income", ylab="Median house value")
#Plot regression line
add.lineartrend.info(x=census$Med_HHD_Inc_ACS_09_13,y=census$Med_House_value_ACS_09_13)
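The same two steps can be applied to any other pair of variables, for example the most strongly correlated pair found in the earlier question (plot not shown here):
#Scatter plot and trend line for the highest-correlation pair
plot.sample(x=census$pct_College_ACS_09_13,y=census$Med_HHD_Inc_ACS_09_13,nsample = 500, xlab="Percent college educated", ylab="Median HHD income")
add.lineartrend.info(x=census$pct_College_ACS_09_13,y=census$Med_HHD_Inc_ACS_09_13)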
Write a function plot.all() which takes as input dataset, a data frame, and nsample, the number of points to be sampled (with default 500). This function will mimic the behavior of plot() when applied to data frames. In other words, if dataset has p columns, then plot.all() should produce a p by p rectangular grid of plots, where the plot in entry i, j of the grid uses column i for the x-axis data, and column j for the y-axis data.
plot.all = function(dataset, nsample = 500) {
  p = ncol(dataset)
  orig.par = par()
  # Set the margins, and plotting grid
  par(mar = c(2, 2, 2, 2))
  par(mfrow = c(p, p))
  #Draw the nsample rows once, so every panel shows the same tracts
  set.seed(300)
  dataset<-dataset[sample(seq_len(nrow(dataset)), nsample), ]
  #Column names taken from the input data frame (not from a global object)
  colname<-colnames(dataset)
  #Entry (i, j) of the grid: column i on the x-axis, column j on the y-axis
  for (i in 1:p){
    for (j in 1:p){
      plot.sample(x = dataset[, i], y = dataset[, j], nsample = nsample, xlab = "", ylab = "")
      text(max(dataset[, i]), y = max(dataset[, j]), colname[i], col = "blue", cex = 0.6, font = 4)
      text(max(dataset[, i]), y = min(dataset[, j]), colname[j], col = "red", cex = 0.6, font = 4)
      add.lineartrend.info(x = dataset[, i], y = dataset[, j])
    }
  }
  # Reset the margins, and plotting grid
  par(mar = orig.par$mar)
  par(mfrow = orig.par$mfrow)
}
mydat = census[,c("Med_HHD_Inc_ACS_09_13", "Med_House_value_ACS_09_13","pct_College_ACS_09_13","Mail_Return_Rate_CEN_2010")]
plot.all(mydat)
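The nsample argument can also be increased for a denser grid of points; for example (plot not shown here):
#Denser grid using 1000 sampled tracts per panel
plot.all(mydat, nsample = 1000)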
These relationships make sense: tracts with higher median household income tend to have higher median house values and a larger share of college-educated residents, consistent with the correlations found earlier.