# Set the global option 

knitr::opts_chunk$set(message = F, warning = F, collapse = T, echo = T, results = 'markup') 

Suppose that your first job out of college turns out to be with the Lakes Region Planning Commission in New Hampshire. The director asked you to write a report on the economic status of the Lakes Region towns by answering the questions below.

How to import data

As a proxy for the entrepreneurial activity, you will use town-level (county subdivision, to be more accurate) personal income data from the U.S. Census: American Community Survey, 5-year estimates over 2012-2016. The data is posted in Moodle. It is titled as “Exploratory Data Analysis: census.csv” under Data Files.

To learn how to import data, watch a brief tutorial video posted in Moodle: it is titled as “How to import data files into R” under Course Resources.


# Load packages 

library(ggplot2) 

library(dplyr) 

 

# Import data 

census <- read.csv("~/resources/rstudio/Exploratory Data Analysis - census.csv") 

 

str(census) 
## 'data.frame':    30 obs. of  39 variables:
##  $ GEOID                         : num  3.3e+09 3.3e+09 3.3e+09 3.3e+09 3.3e+09 ...
##  $ NAME                          : Factor w/ 30 levels "Alexandria town, Grafton County, New Hampshire",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ popTotal                      : int  1876 5285 2658 2159 4631 7281 987 3050 1011 1294 ...
##  $ medianAge                     : num  45 45.8 46.5 37.8 41.8 43.8 57.6 41.4 51.5 44.6 ...
##  $ popNative_USA                 : int  1833 4975 2554 2094 4518 7223 966 3003 989 1268 ...
##  $ popNative                     : int  1835 5041 2573 2142 4533 7236 976 3026 989 1279 ...
##  $ popNaitve_NH                  : int  1008 2450 1418 1008 2273 4420 425 1611 384 659 ...
##  $ popMoved_otherState           : int  10 72 13 41 37 60 31 84 43 6 ...
##  $ popMoved_abroad               : int  5 0 13 0 0 11 0 9 0 0 ...
##  $ popCommute_car                : int  755 2298 1178 993 2537 3615 383 1381 524 696 ...
##  $ popCommute_publicT            : int  0 0 0 0 13 0 0 0 0 11 ...
##  $ popCommute_bicycle            : int  0 0 19 0 0 0 0 0 0 0 ...
##  $ popCommute_foot               : int  6 24 36 143 33 0 0 96 4 0 ...
##  $ popCommute_other              : int  14 12 5 8 0 54 0 6 3 3 ...
##  $ popCommute_home               : int  29 277 182 24 79 39 44 48 49 43 ...
##  $ popBA                         : int  199 1274 742 531 783 928 271 484 374 209 ...
##  $ popPov                        : int  159 305 176 457 234 620 45 323 79 32 ...
##  $ medianIncome                  : int  56597 76676 69489 44000 70037 60938 53958 54015 70625 61058 ...
##  $ earningsAgg                   : int  31933300 127223500 65678000 36025500 118527900 150122700 21898000 54045100 27997300 31278900 ...
##  $ earningsAgg_WageSalary        : int  29137500 118023200 57790900 34825100 112078200 142960400 19791000 51913800 23983000 26628900 ...
##  $ earningsAgg_SelfEmpl          : int  2795800 9200300 7887100 1200400 6449800 7162300 2106900 2131300 4014200 4650000 ...
##  $ LF                            : int  905 2775 1559 1232 2905 3904 459 1618 611 794 ...
##  $ LF_Civilian                   : int  905 2775 1559 1232 2905 3904 459 1610 611 794 ...
##  $ LF_Civilian_Unemployed        : int  68 120 129 59 180 174 21 40 31 28 ...
##  $ LF_Not                        : int  631 1526 689 590 992 2117 442 875 313 267 ...
##  $ housingTotal                  : int  960 4437 1251 1445 2415 3610 1033 2333 785 735 ...
##  $ housingVacant_rent            : int  0 38 0 28 0 83 0 0 0 0 ...
##  $ housingVacant_seasonal        : int  259 2123 151 326 618 585 529 1017 331 115 ...
##  $ medianHomeValue               : int  178800 279000 208000 155600 177500 186100 253200 205600 334000 165900 ...
##  $ medianGrossRent               : int  869 798 1125 834 1068 892 1027 824 1083 941 ...
##  $ unemplRate                    : num  7.51 4.32 8.27 4.79 6.2 ...
##  $ LFparticipationRate           : num  58.9 64.5 69.4 67.6 74.5 ...
##  $ housing_percentOfseasonal     : num  27 47.8 12.1 22.6 25.6 ...
##  $ housing_percentOfrent         : num  0 0.856 0 1.938 0 ...
##  $ earnings_percenOfcurrentlabor : num  100 100 100 100 100 ...
##  $ popPov_percent                : num  8.48 5.77 6.62 21.17 5.05 ...
##  $ popBA_percent                 : num  10.6 24.1 27.9 24.6 16.9 ...
##  $ housingVacant_seasonal_percent: num  27 47.8 12.1 22.6 25.6 ...
##  $ housingVacant_rent_percent    : num  0 0.856 0 1.938 0 ...

 

summary(census) 
##      GEOID                                                       NAME   
##  Min.   :3.300e+09   Alexandria town, Grafton County, New Hampshire: 1  
##  1st Qu.:3.300e+09   Alton town, Belknap County, New Hampshire     : 1  
##  Median :3.300e+09   Andover town, Merrimack County, New Hampshire : 1  
##  Mean   :3.301e+09   Ashland town, Grafton County, New Hampshire   : 1  
##  3rd Qu.:3.301e+09   Barnstead town, Belknap County, New Hampshire : 1  
##  Max.   :3.301e+09   Belmont town, Belknap County, New Hampshire   : 1  
##                      (Other)                                       :24  
##     popTotal       medianAge     popNative_USA     popNative    
##  Min.   :  582   Min.   :37.80   Min.   :  550   Min.   :  557  
##  1st Qu.: 1483   1st Qu.:44.77   1st Qu.: 1452   1st Qu.: 1457  
##  Median : 2934   Median :48.75   Median : 2865   Median : 2889  
##  Mean   : 3774   Mean   :48.59   Mean   : 3645   Mean   : 3669  
##  3rd Qu.: 4774   3rd Qu.:53.00   3rd Qu.: 4576   3rd Qu.: 4615  
##  Max.   :16171   Max.   :59.10   Max.   :15511   Max.   :15613  
##                                                                 
##   popNaitve_NH    popMoved_otherState popMoved_abroad  popCommute_car  
##  Min.   : 202.0   Min.   :  6.00      Min.   : 0.000   Min.   : 234.0  
##  1st Qu.: 698.5   1st Qu.: 23.25      1st Qu.: 0.000   1st Qu.: 710.8  
##  Median :1316.0   Median : 42.00      Median : 0.000   Median :1397.5  
##  Mean   :1881.0   Mean   : 77.80      Mean   : 6.467   Mean   :1670.0  
##  3rd Qu.:2405.8   3rd Qu.: 87.25      3rd Qu.:10.500   3rd Qu.:2273.0  
##  Max.   :9378.0   Max.   :470.00      Max.   :49.000   Max.   :7069.0  
##                                                                        
##  popCommute_publicT popCommute_bicycle popCommute_foot popCommute_other
##  Min.   : 0.0       Min.   : 0.000     Min.   :  0.0   Min.   :  0.00  
##  1st Qu.: 0.0       1st Qu.: 0.000     1st Qu.:  6.5   1st Qu.:  3.50  
##  Median : 0.0       Median : 0.000     Median : 15.0   Median : 13.50  
##  Mean   : 3.9       Mean   : 2.567     Mean   : 34.6   Mean   : 35.47  
##  3rd Qu.: 0.0       3rd Qu.: 0.000     3rd Qu.: 34.0   3rd Qu.: 51.00  
##  Max.   :72.0       Max.   :27.000     Max.   :238.0   Max.   :225.00  
##                                                                        
##  popCommute_home      popBA            popPov        medianIncome  
##  Min.   : 21.00   Min.   : 159.0   Min.   :  30.0   Min.   :44000  
##  1st Qu.: 36.00   1st Qu.: 401.5   1st Qu.: 146.2   1st Qu.:52204  
##  Median : 58.50   Median : 651.0   Median : 215.0   Median :61438  
##  Mean   : 93.07   Mean   : 833.0   Mean   : 390.8   Mean   :59857  
##  3rd Qu.:134.00   3rd Qu.: 973.8   3rd Qu.: 440.5   3rd Qu.:68043  
##  Max.   :360.00   Max.   :2995.0   Max.   :2557.0   Max.   :76676  
##                                                                    
##   earningsAgg        earningsAgg_WageSalary earningsAgg_SelfEmpl
##  Min.   : 11992400   Min.   : 10745100      Min.   : 1200400    
##  1st Qu.: 31442500   1st Qu.: 27256050      1st Qu.: 3072075    
##  Median : 67989550   Median : 60202850      Median : 6436850    
##  Mean   : 84895040   Mean   : 77478727      Mean   : 7416153    
##  3rd Qu.:117084150   3rd Qu.:110519975      3rd Qu.: 9046525    
##  Max.   :353736200   Max.   :325047100      Max.   :28684400    
##                                                                 
##        LF          LF_Civilian     LF_Civilian_Unemployed     LF_Not      
##  Min.   : 288.0   Min.   : 288.0   Min.   : 17.00         Min.   : 233.0  
##  1st Qu.: 821.8   1st Qu.: 821.8   1st Qu.: 40.75         1st Qu.: 530.8  
##  Median :1647.5   Median :1643.5   Median : 68.00         Median : 842.5  
##  Mean   :1979.8   Mean   :1978.9   Mean   :106.07         Mean   :1184.5  
##  3rd Qu.:2739.5   3rd Qu.:2739.5   3rd Qu.:120.00         3rd Qu.:1513.8  
##  Max.   :8468.0   Max.   :8461.0   Max.   :635.00         Max.   :5018.0  
##                                                                           
##   housingTotal  housingVacant_rent housingVacant_seasonal medianHomeValue 
##  Min.   : 498   Min.   :  0.0      Min.   :  27.0         Min.   :155600  
##  1st Qu.:1054   1st Qu.:  0.0      1st Qu.: 259.0         1st Qu.:178275  
##  Median :1826   Median :  0.0      Median : 520.5         Median :206800  
##  Mean   :2501   Mean   : 19.7      Mean   : 798.9         Mean   :226277  
##  3rd Qu.:3486   3rd Qu.: 13.5      3rd Qu.:1189.8         3rd Qu.:263650  
##  Max.   :9945   Max.   :208.0      Max.   :2999.0         Max.   :334000  
##                                                                           
##  medianGrossRent    unemplRate     LFparticipationRate
##  Min.   : 787.0   Min.   : 1.310   Min.   :45.93      
##  1st Qu.: 849.0   1st Qu.: 3.835   1st Qu.:57.72      
##  Median : 941.0   Median : 4.566   Median :63.42      
##  Mean   : 960.7   Mean   : 5.241   Mean   :62.30      
##  3rd Qu.:1027.0   3rd Qu.: 6.912   3rd Qu.:66.53      
##  Max.   :1223.0   Max.   :10.272   Max.   :74.84      
##  NA's   :1                                            
##  housing_percentOfseasonal housing_percentOfrent
##  Min.   : 1.477            Min.   :0.0000       
##  1st Qu.:18.398            1st Qu.:0.0000       
##  Median :29.462            Median :0.0000       
##  Mean   :30.980            Mean   :0.5421       
##  3rd Qu.:41.164            3rd Qu.:0.6900       
##  Max.   :68.020            Max.   :5.0510       
##                                                 
##  earnings_percenOfcurrentlabor popPov_percent   popBA_percent  
##  Min.   :100                   Min.   : 2.473   Min.   :10.61  
##  1st Qu.:100                   1st Qu.: 5.894   1st Qu.:15.94  
##  Median :100                   Median : 8.145   Median :24.35  
##  Mean   :100                   Mean   : 9.077   Mean   :23.81  
##  3rd Qu.:100                   3rd Qu.:10.869   3rd Qu.:29.02  
##  Max.   :100                   Max.   :21.167   Max.   :43.13  
##                                                                
##  housingVacant_seasonal_percent housingVacant_rent_percent
##  Min.   : 1.477                 Min.   :0.0000            
##  1st Qu.:18.398                 1st Qu.:0.0000            
##  Median :29.462                 Median :0.0000            
##  Mean   :30.980                 Mean   :0.5421            
##  3rd Qu.:41.164                 3rd Qu.:0.6900            
##  Max.   :68.020                 Max.   :5.0510            
## 

Q1 How many Lakes Region towns are there in the data set?

Hint: The row represents towns. The str() function returns number of observations (rows).

There are 30 Lakes Region towns in the data set.

Q2 What’s the highest unemployment rate (unemplRate) reported by any town in the Lakes Region?

Hint: The summary() function returns summary statistics, including the maximum value.

10.272 is the highest reported unemployment rate.

Q3 What is the unemployment rate (unemplRate) of a typical town? Did you choose mean or median? Explain your rationale.

Hint: A typical value can be derived from one of two measures of center: mean and median. The median is a more appropriate measure of center in the presence of extreme values as the mean is sensitive to extreme values. See a student’s work in the past semester as an example. Click the link here.

I would choose the median which is 4.566 because median is a better measure of unemployment becuase it takes the actual middle value versus the average which can be affected by outliers.


# Create faceted histogram 

ggplot(census, aes(x = housing_percentOfseasonal)) + 

  geom_histogram() 


 

# Create box plots of city mpg by UR_aboveAve 

ggplot(census, aes(x = 1, y = housing_percentOfseasonal)) + 

  geom_boxplot() 


 

# Create overlaid density plots for same data 

ggplot(census, aes(x = housing_percentOfseasonal)) + 

  geom_density(alpha = .3) 


# If data has extreme values 

census %>% 

  summarize(median = median(housing_percentOfseasonal, na.rm = TRUE), 

            IQR = IQR(housing_percentOfseasonal, na.rm = TRUE)) 
##     median      IQR
## 1 29.46153 22.76686

 

# If data doesn't have extreme values 

census %>% 

  summarize(mean = mean(housing_percentOfseasonal, na.rm = TRUE), 

            sd = sd(housing_percentOfseasonal, na.rm = TRUE)) 
##       mean       sd
## 1 30.97958 16.97828

Suppose that the director suspects that the unemployment rate (unemplRate) may be associated with the educational level of residents (popBA_percent). Divide the towns into two groups: 1) educated towns (the share of population with Bachelor’s degree equal to or higher than the average) and 2) other less educated towns (the share of population with Bachelor’s degree lower than the average).

Q4 What is the unemployment rate of a typical educated town?

The unemployment rate of an educated town is 4.51.

Q5 What is the unemployment rate of a typical less educated town?

The unemployment rate of an uneducated town is 4.76.

Q6 By comparing the median, would you conclude that an educated town is more likely to have a lower unemployment rate than a less educated town is?

By comparing the median, an educated town is more likely to have a lower umemployment rate than an undeucated town.

Q7 Does this mean that there is not a single less educated town that has the unemployment rate lower than any educated towns? Explain using the box plot.

According to the boxplot the highest unemployment rate of an educated town is just over 10 but also the lowest unemployment rate is just below 2.5 so it is possible to have a less educated town with a lower unemployment rate than an educated town.


# Create a new variable, UR > or < average 

UR_ave <- mean(census$popBA_percent) 

 

census$UR_aboveAve <- ifelse(census$popBA_percent >= UR_ave, "equal or above ave", "below ave") 

 

# Create box plots of total population by UR_aboveAve 

ggplot(census, aes(x = UR_aboveAve, y = unemplRate)) + 

  geom_boxplot() 


 

# If data has extreme values 

census %>% 

  group_by(UR_aboveAve) %>% 

  summarize(median = median(unemplRate, na.rm = TRUE), 

            IQR = IQR(unemplRate, na.rm = TRUE)) 
## # A tibble: 2 x 3
##   UR_aboveAve        median   IQR
##   <chr>               <dbl> <dbl>
## 1 below ave            4.76  3.77
## 2 equal or above ave   4.51  2.16

 

# If data doesn't have extreme values 

census %>% 

  group_by(UR_aboveAve) %>% 

  summarize(mean = mean(unemplRate, na.rm = TRUE), 

            sd = sd(unemplRate, na.rm = TRUE)) 
## # A tibble: 2 x 3
##   UR_aboveAve         mean    sd
##   <chr>              <dbl> <dbl>
## 1 below ave           5.38  2.46
## 2 equal or above ave  5.12  2.28