library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.1.3
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.2 v dplyr 1.0.7
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## Warning: package 'ggplot2' was built under R version 4.1.2
## Warning: package 'stringr' was built under R version 4.1.2
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(tidyr)
library(dplyr)
library(ggplot2)
There are three datasets in this file: 1- Census data 2- Economic Research Service -U.S.Department of Agriculture ERS-USDA data 3- FIPS Code data In this assignment, my aim is to perform data cleaning on all three data set, after initial analysis,merge thre three datasets on FIPS Code column and then find the co-relation between the variables and perform final analysis
library(readr)
dataset1 <- read_csv("https://raw.githubusercontent.com/uzmabb182/Data_607_Project2/main/census_us_county_data.csv")
##
## -- Column specification --------------------------------------------------------
## cols(
## County = col_character(),
## Name = col_character(),
## Population = col_double(),
## `Median Age` = col_double(),
## `Household Income` = col_double(),
## `Per Capita Income` = col_double(),
## `Poverty Count` = col_double(),
## `Poverty Rate` = col_double(),
## `Unemployment Rate` = col_double()
## )
head(dataset1)
## # A tibble: 6 x 9
## County Name Population `Median Age` `Household Inco~ `Per Capita Inco~
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 051 Fayette Cou~ 21565 41.9 46650 23194
## 2 107 Logan Count~ 29003 40.1 57308 27546
## 3 165 Saline Coun~ 23994 42.2 44090 25342
## 4 097 Lake County~ 701473 38.4 89427 45766
## 5 127 Massac Coun~ 14219 43.5 47481 23539
## 6 017 Cass County~ 12493 40 52373 26992
## # ... with 3 more variables: Poverty Count <dbl>, Poverty Rate <dbl>,
## # Unemployment Rate <dbl>
names(dataset1) <- tolower(names(dataset1)) # Convert colnames to lower case
dataset1
## # A tibble: 3,220 x 9
## county name population `median age` `household incom~ `per capita inc~
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 051 Fayette Co~ 21565 41.9 46650 23194
## 2 107 Logan Coun~ 29003 40.1 57308 27546
## 3 165 Saline Cou~ 23994 42.2 44090 25342
## 4 097 Lake Count~ 701473 38.4 89427 45766
## 5 127 Massac Cou~ 14219 43.5 47481 23539
## 6 017 Cass Count~ 12493 40 52373 26992
## 7 069 Huntington~ 36359 40.7 53632 26502
## 8 181 White Coun~ 24149 42 54576 27461
## 9 075 Jay County~ 20840 39.7 47658 23443
## 10 145 Shelby Cou~ 44438 40.8 60404 29583
## # ... with 3,210 more rows, and 3 more variables: poverty count <dbl>,
## # poverty rate <dbl>, unemployment rate <dbl>
dataset1 <- dataset1 %>% separate(name, c("county", "state"), ",")
dataset1
## # A tibble: 3,220 x 9
## population county state `median age` `household incom~ `per capita inc~
## <dbl> <chr> <chr> <dbl> <dbl> <dbl>
## 1 21565 Fayette C~ " Illi~ 41.9 46650 23194
## 2 29003 Logan Cou~ " Illi~ 40.1 57308 27546
## 3 23994 Saline Co~ " Illi~ 42.2 44090 25342
## 4 701473 Lake Coun~ " Illi~ 38.4 89427 45766
## 5 14219 Massac Co~ " Illi~ 43.5 47481 23539
## 6 12493 Cass Coun~ " Illi~ 40 52373 26992
## 7 36359 Huntingto~ " Indi~ 40.7 53632 26502
## 8 24149 White Cou~ " Indi~ 42 54576 27461
## 9 20840 Jay County " Indi~ 39.7 47658 23443
## 10 44438 Shelby Co~ " Indi~ 40.8 60404 29583
## # ... with 3,210 more rows, and 3 more variables: poverty count <dbl>,
## # poverty rate <dbl>, unemployment rate <dbl>
dataset1 <- dataset1 %>% distinct()
dataset1
## # A tibble: 3,220 x 9
## population county state `median age` `household incom~ `per capita inc~
## <dbl> <chr> <chr> <dbl> <dbl> <dbl>
## 1 21565 Fayette C~ " Illi~ 41.9 46650 23194
## 2 29003 Logan Cou~ " Illi~ 40.1 57308 27546
## 3 23994 Saline Co~ " Illi~ 42.2 44090 25342
## 4 701473 Lake Coun~ " Illi~ 38.4 89427 45766
## 5 14219 Massac Co~ " Illi~ 43.5 47481 23539
## 6 12493 Cass Coun~ " Illi~ 40 52373 26992
## 7 36359 Huntingto~ " Indi~ 40.7 53632 26502
## 8 24149 White Cou~ " Indi~ 42 54576 27461
## 9 20840 Jay County " Indi~ 39.7 47658 23443
## 10 44438 Shelby Co~ " Indi~ 40.8 60404 29583
## # ... with 3,210 more rows, and 3 more variables: poverty count <dbl>,
## # poverty rate <dbl>, unemployment rate <dbl>
names(dataset1)[4:9] <- c('median_age', 'household_income', 'per_capita_income', 'poverty_count',
'poverty_rate', 'unemployment_rate')
dataset1
## # A tibble: 3,220 x 9
## population county state median_age household_income per_capita_inco~
## <dbl> <chr> <chr> <dbl> <dbl> <dbl>
## 1 21565 Fayette Cou~ " Illin~ 41.9 46650 23194
## 2 29003 Logan County " Illin~ 40.1 57308 27546
## 3 23994 Saline Coun~ " Illin~ 42.2 44090 25342
## 4 701473 Lake County " Illin~ 38.4 89427 45766
## 5 14219 Massac Coun~ " Illin~ 43.5 47481 23539
## 6 12493 Cass County " Illin~ 40 52373 26992
## 7 36359 Huntington ~ " India~ 40.7 53632 26502
## 8 24149 White County " India~ 42 54576 27461
## 9 20840 Jay County " India~ 39.7 47658 23443
## 10 44438 Shelby Coun~ " India~ 40.8 60404 29583
## # ... with 3,210 more rows, and 3 more variables: poverty_count <dbl>,
## # poverty_rate <dbl>, unemployment_rate <dbl>
dataset1$county <-gsub(" County","",as.character(dataset1$county))
dataset1
## # A tibble: 3,220 x 9
## population county state median_age household_income per_capita_inco~
## <dbl> <chr> <chr> <dbl> <dbl> <dbl>
## 1 21565 Fayette " Illinoi~ 41.9 46650 23194
## 2 29003 Logan " Illinoi~ 40.1 57308 27546
## 3 23994 Saline " Illinoi~ 42.2 44090 25342
## 4 701473 Lake " Illinoi~ 38.4 89427 45766
## 5 14219 Massac " Illinoi~ 43.5 47481 23539
## 6 12493 Cass " Illinoi~ 40 52373 26992
## 7 36359 Huntington " Indiana" 40.7 53632 26502
## 8 24149 White " Indiana" 42 54576 27461
## 9 20840 Jay " Indiana" 39.7 47658 23443
## 10 44438 Shelby " Indiana" 40.8 60404 29583
## # ... with 3,210 more rows, and 3 more variables: poverty_count <dbl>,
## # poverty_rate <dbl>, unemployment_rate <dbl>
glimpse(dataset1)
## Rows: 3,220
## Columns: 9
## $ population <dbl> 21565, 29003, 23994, 701473, 14219, 12493, 36359, 24~
## $ county <chr> "Fayette", "Logan", "Saline", "Lake", "Massac", "Cas~
## $ state <chr> " Illinois", " Illinois", " Illinois", " Illinois", ~
## $ median_age <dbl> 41.9, 40.1, 42.2, 38.4, 43.5, 40.0, 40.7, 42.0, 39.7~
## $ household_income <dbl> 46650, 57308, 44090, 89427, 47481, 52373, 53632, 545~
## $ per_capita_income <dbl> 23194, 27546, 25342, 45766, 23539, 26992, 26502, 274~
## $ poverty_count <dbl> 3421, 2323, 4936, 54273, 2331, 1822, 3809, 2198, 288~
## $ poverty_rate <dbl> 15.863668, 8.009516, 20.571810, 7.737005, 16.393558,~
## $ unemployment_rate <dbl> 2.4345003, 2.5445644, 3.4008502, 2.7594790, 1.821506~
str(dataset1)
## tibble [3,220 x 9] (S3: tbl_df/tbl/data.frame)
## $ population : num [1:3220] 21565 29003 23994 701473 14219 ...
## $ county : chr [1:3220] "Fayette" "Logan" "Saline" "Lake" ...
## $ state : chr [1:3220] " Illinois" " Illinois" " Illinois" " Illinois" ...
## $ median_age : num [1:3220] 41.9 40.1 42.2 38.4 43.5 40 40.7 42 39.7 40.8 ...
## $ household_income : num [1:3220] 46650 57308 44090 89427 47481 ...
## $ per_capita_income: num [1:3220] 23194 27546 25342 45766 23539 ...
## $ poverty_count : num [1:3220] 3421 2323 4936 54273 2331 ...
## $ poverty_rate : num [1:3220] 15.86 8.01 20.57 7.74 16.39 ...
## $ unemployment_rate: num [1:3220] 2.43 2.54 3.4 2.76 1.82 ...
dataset1$state <- trimws(dataset1$state, which = c("left"))
str(dataset1)
## tibble [3,220 x 9] (S3: tbl_df/tbl/data.frame)
## $ population : num [1:3220] 21565 29003 23994 701473 14219 ...
## $ county : chr [1:3220] "Fayette" "Logan" "Saline" "Lake" ...
## $ state : chr [1:3220] "Illinois" "Illinois" "Illinois" "Illinois" ...
## $ median_age : num [1:3220] 41.9 40.1 42.2 38.4 43.5 40 40.7 42 39.7 40.8 ...
## $ household_income : num [1:3220] 46650 57308 44090 89427 47481 ...
## $ per_capita_income: num [1:3220] 23194 27546 25342 45766 23539 ...
## $ poverty_count : num [1:3220] 3421 2323 4936 54273 2331 ...
## $ poverty_rate : num [1:3220] 15.86 8.01 20.57 7.74 16.39 ...
## $ unemployment_rate: num [1:3220] 2.43 2.54 3.4 2.76 1.82 ...
# sort by state and county
# sort the dataframe in R using arrange
dataset1 <- arrange(dataset1,state, county)
dataset1
## # A tibble: 3,220 x 9
## population county state median_age household_income per_capita_income
## <dbl> <chr> <chr> <dbl> <dbl> <dbl>
## 1 55380 Autauga Alabama 38.2 58731 29819
## 2 212830 Baldwin Alabama 43 58320 32626
## 3 25361 Barbour Alabama 40.4 32525 18473
## 4 22493 Bibb Alabama 40.9 47542 20778
## 5 57681 Blount Alabama 40.7 49358 24747
## 6 10248 Bullock Alabama 40.2 37785 20877
## 7 19828 Butler Alabama 40.8 40688 21038
## 8 114618 Calhoun Alabama 39.6 47255 25345
## 9 33660 Chambers Alabama 42 42289 22729
## 10 25903 Cherokee Alabama 46.5 41919 24301
## # ... with 3,210 more rows, and 3 more variables: poverty_count <dbl>,
## # poverty_rate <dbl>, unemployment_rate <dbl>
dataset1[is.na(dataset1)] <- 0
dataset1
## # A tibble: 3,220 x 9
## population county state median_age household_income per_capita_income
## <dbl> <chr> <chr> <dbl> <dbl> <dbl>
## 1 55380 Autauga Alabama 38.2 58731 29819
## 2 212830 Baldwin Alabama 43 58320 32626
## 3 25361 Barbour Alabama 40.4 32525 18473
## 4 22493 Bibb Alabama 40.9 47542 20778
## 5 57681 Blount Alabama 40.7 49358 24747
## 6 10248 Bullock Alabama 40.2 37785 20877
## 7 19828 Butler Alabama 40.8 40688 21038
## 8 114618 Calhoun Alabama 39.6 47255 25345
## 9 33660 Chambers Alabama 42 42289 22729
## 10 25903 Cherokee Alabama 46.5 41919 24301
## # ... with 3,210 more rows, and 3 more variables: poverty_count <dbl>,
## # poverty_rate <dbl>, unemployment_rate <dbl>
Step 1: Select data frame Step 2: Group data Step 3: Summarize the data Step 4: Plot the summary statistics
library(ggplot2)
# Step 1
dataset1 %>%
#Step 2
group_by(state)
## # A tibble: 3,220 x 9
## # Groups: state [52]
## population county state median_age household_income per_capita_income
## <dbl> <chr> <chr> <dbl> <dbl> <dbl>
## 1 55380 Autauga Alabama 38.2 58731 29819
## 2 212830 Baldwin Alabama 43 58320 32626
## 3 25361 Barbour Alabama 40.4 32525 18473
## 4 22493 Bibb Alabama 40.9 47542 20778
## 5 57681 Blount Alabama 40.7 49358 24747
## 6 10248 Bullock Alabama 40.2 37785 20877
## 7 19828 Butler Alabama 40.8 40688 21038
## 8 114618 Calhoun Alabama 39.6 47255 25345
## 9 33660 Chambers Alabama 42 42289 22729
## 10 25903 Cherokee Alabama 46.5 41919 24301
## # ... with 3,210 more rows, and 3 more variables: poverty_count <dbl>,
## # poverty_rate <dbl>, unemployment_rate <dbl>
# First filter the state Alabama and selecting the columns
dataset1_alabama <- dataset1 %>% filter(state == "Alabama") %>%
select(c(3:9, 1))
dataset1_alabama
## # A tibble: 67 x 8
## state median_age household_income per_capita_inco~ poverty_count poverty_rate
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Alab~ 38.2 58731 29819 8340 15.1
## 2 Alab~ 43 58320 32626 21704 10.2
## 3 Alab~ 40.4 32525 18473 6875 27.1
## 4 Alab~ 40.9 47542 20778 3740 16.6
## 5 Alab~ 40.7 49358 24747 7739 13.4
## 6 Alab~ 40.2 37785 20877 2825 27.6
## 7 Alab~ 40.8 40688 21038 4397 22.2
## 8 Alab~ 39.6 47255 25345 19969 17.4
## 9 Alab~ 42 42289 22729 5711 17.0
## 10 Alab~ 46.5 41919 24301 3560 13.7
## # ... with 57 more rows, and 2 more variables: unemployment_rate <dbl>,
## # population <dbl>
str(dataset1_alabama)
## tibble [67 x 8] (S3: tbl_df/tbl/data.frame)
## $ state : chr [1:67] "Alabama" "Alabama" "Alabama" "Alabama" ...
## $ median_age : num [1:67] 38.2 43 40.4 40.9 40.7 40.2 40.8 39.6 42 46.5 ...
## $ household_income : num [1:67] 58731 58320 32525 47542 49358 ...
## $ per_capita_income: num [1:67] 29819 32626 18473 20778 24747 ...
## $ poverty_count : num [1:67] 8340 21704 6875 3740 7739 ...
## $ poverty_rate : num [1:67] 15.1 10.2 27.1 16.6 13.4 ...
## $ unemployment_rate: num [1:67] 1.69 1.99 3.35 2.93 1.32 ...
## $ population : num [1:67] 55380 212830 25361 22493 57681 ...
data1_al <- dataset1_alabama %>%
pivot_longer(cols = c(2:8),
names_to = "Variable",
values_to = "Value")
data1_al
## # A tibble: 469 x 3
## state Variable Value
## <chr> <chr> <dbl>
## 1 Alabama median_age 38.2
## 2 Alabama household_income 58731
## 3 Alabama per_capita_income 29819
## 4 Alabama poverty_count 8340
## 5 Alabama poverty_rate 15.1
## 6 Alabama unemployment_rate 1.69
## 7 Alabama population 55380
## 8 Alabama median_age 43
## 9 Alabama household_income 58320
## 10 Alabama per_capita_income 32626
## # ... with 459 more rows
bar_plot <- ggplot(data = data1_al, aes(x = Variable, y = Value, fill = Variable)) +
geom_bar(stat = "identity") + coord_flip() +
labs(
x = "Variables",
y = "Values",
title = paste(
"Census data for Alabama"
)
)
bar_plot
dataset2 <- read_csv("https://raw.githubusercontent.com/uzmabb182/Data_607_Project2/main/ers_usda_education_data.csv")
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## State = col_character(),
## `Area name` = col_character(),
## `Less than a high school diploma, 1970` = col_number(),
## `High school diploma only, 1970` = col_number(),
## `Some college (1-3 years), 1970` = col_number(),
## `Four years of college or higher, 1970` = col_number(),
## `Less than a high school diploma, 1980` = col_number(),
## `High school diploma only, 1980` = col_number(),
## `Some college (1-3 years), 1980` = col_number(),
## `Four years of college or higher, 1980` = col_number(),
## `Less than a high school diploma, 1990` = col_number(),
## `High school diploma only, 1990` = col_number(),
## `Some college or associate's degree, 1990` = col_number(),
## `Bachelor's degree or higher, 1990` = col_number(),
## `Less than a high school diploma, 2000` = col_number(),
## `High school diploma only, 2000` = col_number(),
## `Some college or associate's degree, 2000` = col_number(),
## `Bachelor's degree or higher, 2000` = col_number(),
## `Less than a high school diploma, 2015-19` = col_number(),
## `High school diploma only, 2015-19` = col_number()
## # ... with 2 more columns
## )
## i Use `spec()` for the full column specifications.
head(dataset2)
## # A tibble: 6 x 47
## `FIPS Code` State `Area name` `2003 Rural-urban Conti~ `2003 Urban Influenc~
## <dbl> <chr> <chr> <dbl> <dbl>
## 1 0 US United States NA NA
## 2 1000 AL Alabama NA NA
## 3 1001 AL Autauga Coun~ 2 2
## 4 1003 AL Baldwin Coun~ 4 5
## 5 1005 AL Barbour Coun~ 6 6
## 6 1007 AL Bibb County 1 1
## # ... with 42 more variables: 2013 Rural-urban Continuum Code <dbl>,
## # 2013 Urban Influence Code <dbl>,
## # Less than a high school diploma, 1970 <dbl>,
## # High school diploma only, 1970 <dbl>, Some college (1-3 years), 1970 <dbl>,
## # Four years of college or higher, 1970 <dbl>,
## # Percent of adults with less than a high school diploma, 1970 <dbl>,
## # Percent of adults with a high school diploma only, 1970 <dbl>,
## # Percent of adults completing some college (1-3 years), 1970 <dbl>,
## # Percent of adults completing four years of college or higher, 1970 <dbl>,
## # Less than a high school diploma, 1980 <dbl>,
## # High school diploma only, 1980 <dbl>, Some college (1-3 years), 1980 <dbl>,
## # Four years of college or higher, 1980 <dbl>,
## # Percent of adults with less than a high school diploma, 1980 <dbl>,
## # Percent of adults with a high school diploma only, 1980 <dbl>,
## # Percent of adults completing some college (1-3 years), 1980 <dbl>,
## # Percent of adults completing four years of college or higher, 1980 <dbl>,
## # Less than a high school diploma, 1990 <dbl>,
## # High school diploma only, 1990 <dbl>,
## # Some college or associate's degree, 1990 <dbl>,
## # Bachelor's degree or higher, 1990 <dbl>,
## # Percent of adults with less than a high school diploma, 1990 <dbl>,
## # Percent of adults with a high school diploma only, 1990 <dbl>,
## # Percent of adults completing some college or associate's degree, 1990 <dbl>,
## # Percent of adults with a bachelor's degree or higher, 1990 <dbl>,
## # Less than a high school diploma, 2000 <dbl>,
## # High school diploma only, 2000 <dbl>,
## # Some college or associate's degree, 2000 <dbl>,
## # Bachelor's degree or higher, 2000 <dbl>,
## # Percent of adults with less than a high school diploma, 2000 <dbl>,
## # Percent of adults with a high school diploma only, 2000 <dbl>,
## # Percent of adults completing some college or associate's degree, 2000 <dbl>,
## # Percent of adults with a bachelor's degree or higher, 2000 <dbl>,
## # Less than a high school diploma, 2015-19 <dbl>,
## # High school diploma only, 2015-19 <dbl>,
## # Some college or associate's degree, 2015-19 <dbl>,
## # Bachelor's degree or higher, 2015-19 <dbl>,
## # Percent of adults with less than a high school diploma, 2015-19 <dbl>,
## # Percent of adults with a high school diploma only, 2015-19 <dbl>,
## # Percent of adults completing some college or associate's degree, 2015-19 <dbl>,
## # Percent of adults with a bachelor's degree or higher, 2015-19 <dbl>
colnames(dataset2)
## [1] "FIPS Code"
## [2] "State"
## [3] "Area name"
## [4] "2003 Rural-urban Continuum Code"
## [5] "2003 Urban Influence Code"
## [6] "2013 Rural-urban Continuum Code"
## [7] "2013 Urban Influence Code"
## [8] "Less than a high school diploma, 1970"
## [9] "High school diploma only, 1970"
## [10] "Some college (1-3 years), 1970"
## [11] "Four years of college or higher, 1970"
## [12] "Percent of adults with less than a high school diploma, 1970"
## [13] "Percent of adults with a high school diploma only, 1970"
## [14] "Percent of adults completing some college (1-3 years), 1970"
## [15] "Percent of adults completing four years of college or higher, 1970"
## [16] "Less than a high school diploma, 1980"
## [17] "High school diploma only, 1980"
## [18] "Some college (1-3 years), 1980"
## [19] "Four years of college or higher, 1980"
## [20] "Percent of adults with less than a high school diploma, 1980"
## [21] "Percent of adults with a high school diploma only, 1980"
## [22] "Percent of adults completing some college (1-3 years), 1980"
## [23] "Percent of adults completing four years of college or higher, 1980"
## [24] "Less than a high school diploma, 1990"
## [25] "High school diploma only, 1990"
## [26] "Some college or associate's degree, 1990"
## [27] "Bachelor's degree or higher, 1990"
## [28] "Percent of adults with less than a high school diploma, 1990"
## [29] "Percent of adults with a high school diploma only, 1990"
## [30] "Percent of adults completing some college or associate's degree, 1990"
## [31] "Percent of adults with a bachelor's degree or higher, 1990"
## [32] "Less than a high school diploma, 2000"
## [33] "High school diploma only, 2000"
## [34] "Some college or associate's degree, 2000"
## [35] "Bachelor's degree or higher, 2000"
## [36] "Percent of adults with less than a high school diploma, 2000"
## [37] "Percent of adults with a high school diploma only, 2000"
## [38] "Percent of adults completing some college or associate's degree, 2000"
## [39] "Percent of adults with a bachelor's degree or higher, 2000"
## [40] "Less than a high school diploma, 2015-19"
## [41] "High school diploma only, 2015-19"
## [42] "Some college or associate's degree, 2015-19"
## [43] "Bachelor's degree or higher, 2015-19"
## [44] "Percent of adults with less than a high school diploma, 2015-19"
## [45] "Percent of adults with a high school diploma only, 2015-19"
## [46] "Percent of adults completing some college or associate's degree, 2015-19"
## [47] "Percent of adults with a bachelor's degree or higher, 2015-19"
dataset2 <- select(dataset2, -(4:32))
dataset2
## # A tibble: 3,283 x 18
## `FIPS Code` State `Area name` `High school diploma~ `Some college or associ~
## <dbl> <chr> <chr> <dbl> <dbl>
## 1 0 US United Stat~ 52168981 49864428
## 2 1000 AL Alabama 877216 746495
## 3 1001 AL Autauga Cou~ 9332 7413
## 4 1003 AL Baldwin Cou~ 28428 28178
## 5 1005 AL Barbour Cou~ 6124 4025
## 6 1007 AL Bibb County 4838 2756
## 7 1009 AL Blount Coun~ 12136 8371
## 8 1011 AL Bullock Cou~ 2667 1325
## 9 1013 AL Butler Coun~ 4749 3146
## 10 1015 AL Calhoun Cou~ 23856 19576
## # ... with 3,273 more rows, and 13 more variables:
## # Bachelor's degree or higher, 2000 <dbl>,
## # Percent of adults with less than a high school diploma, 2000 <dbl>,
## # Percent of adults with a high school diploma only, 2000 <dbl>,
## # Percent of adults completing some college or associate's degree, 2000 <dbl>,
## # Percent of adults with a bachelor's degree or higher, 2000 <dbl>,
## # Less than a high school diploma, 2015-19 <dbl>,
## # High school diploma only, 2015-19 <dbl>,
## # Some college or associate's degree, 2015-19 <dbl>,
## # Bachelor's degree or higher, 2015-19 <dbl>,
## # Percent of adults with less than a high school diploma, 2015-19 <dbl>,
## # Percent of adults with a high school diploma only, 2015-19 <dbl>,
## # Percent of adults completing some college or associate's degree, 2015-19 <dbl>,
## # Percent of adults with a bachelor's degree or higher, 2015-19 <dbl>
names(dataset2) <- gsub(" ", "_", names(dataset2))
dataset2
## # A tibble: 3,283 x 18
## FIPS_Code State Area_name `High_school_di~ `Some_college_o~ `Bachelor's_deg~
## <dbl> <chr> <chr> <dbl> <dbl> <dbl>
## 1 0 US United St~ 52168981 49864428 44462605
## 2 1000 AL Alabama 877216 746495 549608
## 3 1001 AL Autauga C~ 9332 7413 4972
## 4 1003 AL Baldwin C~ 28428 28178 22146
## 5 1005 AL Barbour C~ 6124 4025 2068
## 6 1007 AL Bibb Coun~ 4838 2756 962
## 7 1009 AL Blount Co~ 12136 8371 3235
## 8 1011 AL Bullock C~ 2667 1325 586
## 9 1013 AL Butler Co~ 4749 3146 1433
## 10 1015 AL Calhoun C~ 23856 19576 11265
## # ... with 3,273 more rows, and 12 more variables:
## # Percent_of_adults_with_less_than_a_high_school_diploma,_2000 <dbl>,
## # Percent_of_adults_with_a_high_school_diploma_only,_2000 <dbl>,
## # Percent_of_adults_completing_some_college_or_associate's_degree,_2000 <dbl>,
## # Percent_of_adults_with_a_bachelor's_degree_or_higher,_2000 <dbl>,
## # Less_than_a_high_school_diploma,_2015-19 <dbl>,
## # High_school_diploma_only,_2015-19 <dbl>,
## # Some_college_or_associate's_degree,_2015-19 <dbl>,
## # Bachelor's_degree_or_higher,_2015-19 <dbl>,
## # Percent_of_adults_with_less_than_a_high_school_diploma,_2015-19 <dbl>,
## # Percent_of_adults_with_a_high_school_diploma_only,_2015-19 <dbl>,
## # Percent_of_adults_completing_some_college_or_associate's_degree,_2015-19 <dbl>,
## # Percent_of_adults_with_a_bachelor's_degree_or_higher,_2015-19 <dbl>
names(dataset2) <- gsub(",", "", names(dataset2))
dataset2
## # A tibble: 3,283 x 18
## FIPS_Code State Area_name High_school_dip~ `Some_college_o~ `Bachelor's_deg~
## <dbl> <chr> <chr> <dbl> <dbl> <dbl>
## 1 0 US United St~ 52168981 49864428 44462605
## 2 1000 AL Alabama 877216 746495 549608
## 3 1001 AL Autauga C~ 9332 7413 4972
## 4 1003 AL Baldwin C~ 28428 28178 22146
## 5 1005 AL Barbour C~ 6124 4025 2068
## 6 1007 AL Bibb Coun~ 4838 2756 962
## 7 1009 AL Blount Co~ 12136 8371 3235
## 8 1011 AL Bullock C~ 2667 1325 586
## 9 1013 AL Butler Co~ 4749 3146 1433
## 10 1015 AL Calhoun C~ 23856 19576 11265
## # ... with 3,273 more rows, and 12 more variables:
## # Percent_of_adults_with_less_than_a_high_school_diploma_2000 <dbl>,
## # Percent_of_adults_with_a_high_school_diploma_only_2000 <dbl>,
## # Percent_of_adults_completing_some_college_or_associate's_degree_2000 <dbl>,
## # Percent_of_adults_with_a_bachelor's_degree_or_higher_2000 <dbl>,
## # Less_than_a_high_school_diploma_2015-19 <dbl>,
## # High_school_diploma_only_2015-19 <dbl>,
## # Some_college_or_associate's_degree_2015-19 <dbl>,
## # Bachelor's_degree_or_higher_2015-19 <dbl>,
## # Percent_of_adults_with_less_than_a_high_school_diploma_2015-19 <dbl>,
## # Percent_of_adults_with_a_high_school_diploma_only_2015-19 <dbl>,
## # Percent_of_adults_completing_some_college_or_associate's_degree_2015-19 <dbl>,
## # Percent_of_adults_with_a_bachelor's_degree_or_higher_2015-19 <dbl>
names(dataset2) <- gsub("'", "", names(dataset2))
dataset2
## # A tibble: 3,283 x 18
## FIPS_Code State Area_name High_school_dip~ Some_college_or~ Bachelors_degre~
## <dbl> <chr> <chr> <dbl> <dbl> <dbl>
## 1 0 US United St~ 52168981 49864428 44462605
## 2 1000 AL Alabama 877216 746495 549608
## 3 1001 AL Autauga C~ 9332 7413 4972
## 4 1003 AL Baldwin C~ 28428 28178 22146
## 5 1005 AL Barbour C~ 6124 4025 2068
## 6 1007 AL Bibb Coun~ 4838 2756 962
## 7 1009 AL Blount Co~ 12136 8371 3235
## 8 1011 AL Bullock C~ 2667 1325 586
## 9 1013 AL Butler Co~ 4749 3146 1433
## 10 1015 AL Calhoun C~ 23856 19576 11265
## # ... with 3,273 more rows, and 12 more variables:
## # Percent_of_adults_with_less_than_a_high_school_diploma_2000 <dbl>,
## # Percent_of_adults_with_a_high_school_diploma_only_2000 <dbl>,
## # Percent_of_adults_completing_some_college_or_associates_degree_2000 <dbl>,
## # Percent_of_adults_with_a_bachelors_degree_or_higher_2000 <dbl>,
## # Less_than_a_high_school_diploma_2015-19 <dbl>,
## # High_school_diploma_only_2015-19 <dbl>,
## # Some_college_or_associates_degree_2015-19 <dbl>,
## # Bachelors_degree_or_higher_2015-19 <dbl>,
## # Percent_of_adults_with_less_than_a_high_school_diploma_2015-19 <dbl>,
## # Percent_of_adults_with_a_high_school_diploma_only_2015-19 <dbl>,
## # Percent_of_adults_completing_some_college_or_associates_degree_2015-19 <dbl>,
## # Percent_of_adults_with_a_bachelors_degree_or_higher_2015-19 <dbl>
names(dataset2) <- gsub("-", "_", names(dataset2))
dataset2
## # A tibble: 3,283 x 18
## FIPS_Code State Area_name High_school_dip~ Some_college_or~ Bachelors_degre~
## <dbl> <chr> <chr> <dbl> <dbl> <dbl>
## 1 0 US United St~ 52168981 49864428 44462605
## 2 1000 AL Alabama 877216 746495 549608
## 3 1001 AL Autauga C~ 9332 7413 4972
## 4 1003 AL Baldwin C~ 28428 28178 22146
## 5 1005 AL Barbour C~ 6124 4025 2068
## 6 1007 AL Bibb Coun~ 4838 2756 962
## 7 1009 AL Blount Co~ 12136 8371 3235
## 8 1011 AL Bullock C~ 2667 1325 586
## 9 1013 AL Butler Co~ 4749 3146 1433
## 10 1015 AL Calhoun C~ 23856 19576 11265
## # ... with 3,273 more rows, and 12 more variables:
## # Percent_of_adults_with_less_than_a_high_school_diploma_2000 <dbl>,
## # Percent_of_adults_with_a_high_school_diploma_only_2000 <dbl>,
## # Percent_of_adults_completing_some_college_or_associates_degree_2000 <dbl>,
## # Percent_of_adults_with_a_bachelors_degree_or_higher_2000 <dbl>,
## # Less_than_a_high_school_diploma_2015_19 <dbl>,
## # High_school_diploma_only_2015_19 <dbl>,
## # Some_college_or_associates_degree_2015_19 <dbl>,
## # Bachelors_degree_or_higher_2015_19 <dbl>,
## # Percent_of_adults_with_less_than_a_high_school_diploma_2015_19 <dbl>,
## # Percent_of_adults_with_a_high_school_diploma_only_2015_19 <dbl>,
## # Percent_of_adults_completing_some_college_or_associates_degree_2015_19 <dbl>,
## # Percent_of_adults_with_a_bachelors_degree_or_higher_2015_19 <dbl>
names(dataset2) <- tolower(names(dataset2)) # Convert colnames to lower case
dataset2
## # A tibble: 3,283 x 18
## fips_code state area_name high_school_dip~ some_college_or~ bachelors_degre~
## <dbl> <chr> <chr> <dbl> <dbl> <dbl>
## 1 0 US United St~ 52168981 49864428 44462605
## 2 1000 AL Alabama 877216 746495 549608
## 3 1001 AL Autauga C~ 9332 7413 4972
## 4 1003 AL Baldwin C~ 28428 28178 22146
## 5 1005 AL Barbour C~ 6124 4025 2068
## 6 1007 AL Bibb Coun~ 4838 2756 962
## 7 1009 AL Blount Co~ 12136 8371 3235
## 8 1011 AL Bullock C~ 2667 1325 586
## 9 1013 AL Butler Co~ 4749 3146 1433
## 10 1015 AL Calhoun C~ 23856 19576 11265
## # ... with 3,273 more rows, and 12 more variables:
## # percent_of_adults_with_less_than_a_high_school_diploma_2000 <dbl>,
## # percent_of_adults_with_a_high_school_diploma_only_2000 <dbl>,
## # percent_of_adults_completing_some_college_or_associates_degree_2000 <dbl>,
## # percent_of_adults_with_a_bachelors_degree_or_higher_2000 <dbl>,
## # less_than_a_high_school_diploma_2015_19 <dbl>,
## # high_school_diploma_only_2015_19 <dbl>,
## # some_college_or_associates_degree_2015_19 <dbl>,
## # bachelors_degree_or_higher_2015_19 <dbl>,
## # percent_of_adults_with_less_than_a_high_school_diploma_2015_19 <dbl>,
## # percent_of_adults_with_a_high_school_diploma_only_2015_19 <dbl>,
## # percent_of_adults_completing_some_college_or_associates_degree_2015_19 <dbl>,
## # percent_of_adults_with_a_bachelors_degree_or_higher_2015_19 <dbl>
dataset2$area_name <-gsub(" County","",as.character(dataset2$area_name))
dataset2
## # A tibble: 3,283 x 18
## fips_code state area_name high_school_dip~ some_college_or~ bachelors_degre~
## <dbl> <chr> <chr> <dbl> <dbl> <dbl>
## 1 0 US United St~ 52168981 49864428 44462605
## 2 1000 AL Alabama 877216 746495 549608
## 3 1001 AL Autauga 9332 7413 4972
## 4 1003 AL Baldwin 28428 28178 22146
## 5 1005 AL Barbour 6124 4025 2068
## 6 1007 AL Bibb 4838 2756 962
## 7 1009 AL Blount 12136 8371 3235
## 8 1011 AL Bullock 2667 1325 586
## 9 1013 AL Butler 4749 3146 1433
## 10 1015 AL Calhoun 23856 19576 11265
## # ... with 3,273 more rows, and 12 more variables:
## # percent_of_adults_with_less_than_a_high_school_diploma_2000 <dbl>,
## # percent_of_adults_with_a_high_school_diploma_only_2000 <dbl>,
## # percent_of_adults_completing_some_college_or_associates_degree_2000 <dbl>,
## # percent_of_adults_with_a_bachelors_degree_or_higher_2000 <dbl>,
## # less_than_a_high_school_diploma_2015_19 <dbl>,
## # high_school_diploma_only_2015_19 <dbl>,
## # some_college_or_associates_degree_2015_19 <dbl>,
## # bachelors_degree_or_higher_2015_19 <dbl>,
## # percent_of_adults_with_less_than_a_high_school_diploma_2015_19 <dbl>,
## # percent_of_adults_with_a_high_school_diploma_only_2015_19 <dbl>,
## # percent_of_adults_completing_some_college_or_associates_degree_2015_19 <dbl>,
## # percent_of_adults_with_a_bachelors_degree_or_higher_2015_19 <dbl>
dataset2 <- dataset2 %>% distinct()
dataset2
## # A tibble: 3,283 x 18
## fips_code state area_name high_school_dip~ some_college_or~ bachelors_degre~
## <dbl> <chr> <chr> <dbl> <dbl> <dbl>
## 1 0 US United St~ 52168981 49864428 44462605
## 2 1000 AL Alabama 877216 746495 549608
## 3 1001 AL Autauga 9332 7413 4972
## 4 1003 AL Baldwin 28428 28178 22146
## 5 1005 AL Barbour 6124 4025 2068
## 6 1007 AL Bibb 4838 2756 962
## 7 1009 AL Blount 12136 8371 3235
## 8 1011 AL Bullock 2667 1325 586
## 9 1013 AL Butler 4749 3146 1433
## 10 1015 AL Calhoun 23856 19576 11265
## # ... with 3,273 more rows, and 12 more variables:
## # percent_of_adults_with_less_than_a_high_school_diploma_2000 <dbl>,
## # percent_of_adults_with_a_high_school_diploma_only_2000 <dbl>,
## # percent_of_adults_completing_some_college_or_associates_degree_2000 <dbl>,
## # percent_of_adults_with_a_bachelors_degree_or_higher_2000 <dbl>,
## # less_than_a_high_school_diploma_2015_19 <dbl>,
## # high_school_diploma_only_2015_19 <dbl>,
## # some_college_or_associates_degree_2015_19 <dbl>,
## # bachelors_degree_or_higher_2015_19 <dbl>,
## # percent_of_adults_with_less_than_a_high_school_diploma_2015_19 <dbl>,
## # percent_of_adults_with_a_high_school_diploma_only_2015_19 <dbl>,
## # percent_of_adults_completing_some_college_or_associates_degree_2015_19 <dbl>,
## # percent_of_adults_with_a_bachelors_degree_or_higher_2015_19 <dbl>
dataset2 <- dataset2[-c(seq(1, 1, by=0)),]
dataset2
## # A tibble: 3,282 x 18
## fips_code state area_name high_school_dipl~ some_college_or~ bachelors_degre~
## <dbl> <chr> <chr> <dbl> <dbl> <dbl>
## 1 1000 AL Alabama 877216 746495 549608
## 2 1001 AL Autauga 9332 7413 4972
## 3 1003 AL Baldwin 28428 28178 22146
## 4 1005 AL Barbour 6124 4025 2068
## 5 1007 AL Bibb 4838 2756 962
## 6 1009 AL Blount 12136 8371 3235
## 7 1011 AL Bullock 2667 1325 586
## 8 1013 AL Butler 4749 3146 1433
## 9 1015 AL Calhoun 23856 19576 11265
## 10 1017 AL Chambers 7863 5517 2339
## # ... with 3,272 more rows, and 12 more variables:
## # percent_of_adults_with_less_than_a_high_school_diploma_2000 <dbl>,
## # percent_of_adults_with_a_high_school_diploma_only_2000 <dbl>,
## # percent_of_adults_completing_some_college_or_associates_degree_2000 <dbl>,
## # percent_of_adults_with_a_bachelors_degree_or_higher_2000 <dbl>,
## # less_than_a_high_school_diploma_2015_19 <dbl>,
## # high_school_diploma_only_2015_19 <dbl>,
## # some_college_or_associates_degree_2015_19 <dbl>,
## # bachelors_degree_or_higher_2015_19 <dbl>,
## # percent_of_adults_with_less_than_a_high_school_diploma_2015_19 <dbl>,
## # percent_of_adults_with_a_high_school_diploma_only_2015_19 <dbl>,
## # percent_of_adults_completing_some_college_or_associates_degree_2015_19 <dbl>,
## # percent_of_adults_with_a_bachelors_degree_or_higher_2015_19 <dbl>
dataset2[is.na(dataset2)] <- 0
dataset2
## # A tibble: 3,282 x 18
## fips_code state area_name high_school_dipl~ some_college_or~ bachelors_degre~
## <dbl> <chr> <chr> <dbl> <dbl> <dbl>
## 1 1000 AL Alabama 877216 746495 549608
## 2 1001 AL Autauga 9332 7413 4972
## 3 1003 AL Baldwin 28428 28178 22146
## 4 1005 AL Barbour 6124 4025 2068
## 5 1007 AL Bibb 4838 2756 962
## 6 1009 AL Blount 12136 8371 3235
## 7 1011 AL Bullock 2667 1325 586
## 8 1013 AL Butler 4749 3146 1433
## 9 1015 AL Calhoun 23856 19576 11265
## 10 1017 AL Chambers 7863 5517 2339
## # ... with 3,272 more rows, and 12 more variables:
## # percent_of_adults_with_less_than_a_high_school_diploma_2000 <dbl>,
## # percent_of_adults_with_a_high_school_diploma_only_2000 <dbl>,
## # percent_of_adults_completing_some_college_or_associates_degree_2000 <dbl>,
## # percent_of_adults_with_a_bachelors_degree_or_higher_2000 <dbl>,
## # less_than_a_high_school_diploma_2015_19 <dbl>,
## # high_school_diploma_only_2015_19 <dbl>,
## # some_college_or_associates_degree_2015_19 <dbl>,
## # bachelors_degree_or_higher_2015_19 <dbl>,
## # percent_of_adults_with_less_than_a_high_school_diploma_2015_19 <dbl>,
## # percent_of_adults_with_a_high_school_diploma_only_2015_19 <dbl>,
## # percent_of_adults_completing_some_college_or_associates_degree_2015_19 <dbl>,
## # percent_of_adults_with_a_bachelors_degree_or_higher_2015_19 <dbl>
names(dataset2)[3] <- c('county')
dataset2
## # A tibble: 3,282 x 18
## fips_code state county high_school_dipl~ some_college_or_~ bachelors_degree~
## <dbl> <chr> <chr> <dbl> <dbl> <dbl>
## 1 1000 AL Alabama 877216 746495 549608
## 2 1001 AL Autauga 9332 7413 4972
## 3 1003 AL Baldwin 28428 28178 22146
## 4 1005 AL Barbour 6124 4025 2068
## 5 1007 AL Bibb 4838 2756 962
## 6 1009 AL Blount 12136 8371 3235
## 7 1011 AL Bullock 2667 1325 586
## 8 1013 AL Butler 4749 3146 1433
## 9 1015 AL Calhoun 23856 19576 11265
## 10 1017 AL Chambe~ 7863 5517 2339
## # ... with 3,272 more rows, and 12 more variables:
## # percent_of_adults_with_less_than_a_high_school_diploma_2000 <dbl>,
## # percent_of_adults_with_a_high_school_diploma_only_2000 <dbl>,
## # percent_of_adults_completing_some_college_or_associates_degree_2000 <dbl>,
## # percent_of_adults_with_a_bachelors_degree_or_higher_2000 <dbl>,
## # less_than_a_high_school_diploma_2015_19 <dbl>,
## # high_school_diploma_only_2015_19 <dbl>,
## # some_college_or_associates_degree_2015_19 <dbl>,
## # bachelors_degree_or_higher_2015_19 <dbl>,
## # percent_of_adults_with_less_than_a_high_school_diploma_2015_19 <dbl>,
## # percent_of_adults_with_a_high_school_diploma_only_2015_19 <dbl>,
## # percent_of_adults_completing_some_college_or_associates_degree_2015_19 <dbl>,
## # percent_of_adults_with_a_bachelors_degree_or_higher_2015_19 <dbl>
# sort by state and county
# sort the dataframe in R using arrange
dataset2 <- arrange(dataset2,state, county)
dataset2
## # A tibble: 3,282 x 18
## fips_code state county high_school_dipl~ some_college_or_~ bachelors_degre~
## <dbl> <chr> <chr> <dbl> <dbl> <dbl>
## 1 2000 AK Alaska 105812 135655 93807
## 2 2010 AK Aleutia~ 0 0 0
## 3 2013 AK Aleutia~ 983 419 98
## 4 2016 AK Aleutia~ 1462 1407 469
## 5 2020 AK Anchora~ 38741 59428 46240
## 6 2050 AK Bethel ~ 3098 1548 1050
## 7 2060 AK Bristol~ 266 264 165
## 8 2068 AK Denali ~ 400 508 299
## 9 2070 AK Dilling~ 900 699 436
## 10 2090 AK Fairban~ 12240 18848 12968
## # ... with 3,272 more rows, and 12 more variables:
## # percent_of_adults_with_less_than_a_high_school_diploma_2000 <dbl>,
## # percent_of_adults_with_a_high_school_diploma_only_2000 <dbl>,
## # percent_of_adults_completing_some_college_or_associates_degree_2000 <dbl>,
## # percent_of_adults_with_a_bachelors_degree_or_higher_2000 <dbl>,
## # less_than_a_high_school_diploma_2015_19 <dbl>,
## # high_school_diploma_only_2015_19 <dbl>,
## # some_college_or_associates_degree_2015_19 <dbl>,
## # bachelors_degree_or_higher_2015_19 <dbl>,
## # percent_of_adults_with_less_than_a_high_school_diploma_2015_19 <dbl>,
## # percent_of_adults_with_a_high_school_diploma_only_2015_19 <dbl>,
## # percent_of_adults_completing_some_college_or_associates_degree_2015_19 <dbl>,
## # percent_of_adults_with_a_bachelors_degree_or_higher_2015_19 <dbl>
glimpse(dataset2)
## Rows: 3,282
## Columns: 18
## $ fips_code <dbl> ~
## $ state <chr> ~
## $ county <chr> ~
## $ high_school_diploma_only_2000 <dbl> ~
## $ some_college_or_associates_degree_2000 <dbl> ~
## $ bachelors_degree_or_higher_2000 <dbl> ~
## $ percent_of_adults_with_less_than_a_high_school_diploma_2000 <dbl> ~
## $ percent_of_adults_with_a_high_school_diploma_only_2000 <dbl> ~
## $ percent_of_adults_completing_some_college_or_associates_degree_2000 <dbl> ~
## $ percent_of_adults_with_a_bachelors_degree_or_higher_2000 <dbl> ~
## $ less_than_a_high_school_diploma_2015_19 <dbl> ~
## $ high_school_diploma_only_2015_19 <dbl> ~
## $ some_college_or_associates_degree_2015_19 <dbl> ~
## $ bachelors_degree_or_higher_2015_19 <dbl> ~
## $ percent_of_adults_with_less_than_a_high_school_diploma_2015_19 <dbl> ~
## $ percent_of_adults_with_a_high_school_diploma_only_2015_19 <dbl> ~
## $ percent_of_adults_completing_some_college_or_associates_degree_2015_19 <dbl> ~
## $ percent_of_adults_with_a_bachelors_degree_or_higher_2015_19 <dbl> ~
dataset3 <- read_csv("https://raw.githubusercontent.com/uzmabb182/Data_607_Project2/main/county_fips_data.csv")
##
## -- Column specification --------------------------------------------------------
## cols(
## Fips = col_double(),
## County_Name = col_character(),
## State_Abbr = col_character(),
## State_Name = col_character()
## )
# Now tidying dataset3
head(dataset3)
## # A tibble: 6 x 4
## Fips County_Name State_Abbr State_Name
## <dbl> <chr> <chr> <chr>
## 1 1001 Autauga County AL Alabama
## 2 1003 Baldwin County AL Alabama
## 3 1005 Barbour County AL Alabama
## 4 1007 Bibb County AL Alabama
## 5 1009 Blount County AL Alabama
## 6 1011 Bullock County AL Alabama
names(dataset3) <- tolower(names(dataset3)) # Convert colnames to lower case
dataset3
## # A tibble: 3,146 x 4
## fips county_name state_abbr state_name
## <dbl> <chr> <chr> <chr>
## 1 1001 Autauga County AL Alabama
## 2 1003 Baldwin County AL Alabama
## 3 1005 Barbour County AL Alabama
## 4 1007 Bibb County AL Alabama
## 5 1009 Blount County AL Alabama
## 6 1011 Bullock County AL Alabama
## 7 1013 Butler County AL Alabama
## 8 1015 Calhoun County AL Alabama
## 9 1017 Chambers County AL Alabama
## 10 1019 Cherokee County AL Alabama
## # ... with 3,136 more rows
dataset3$county_name <-gsub(" County","",as.character(dataset3$county_name))
dataset3
## # A tibble: 3,146 x 4
## fips county_name state_abbr state_name
## <dbl> <chr> <chr> <chr>
## 1 1001 Autauga AL Alabama
## 2 1003 Baldwin AL Alabama
## 3 1005 Barbour AL Alabama
## 4 1007 Bibb AL Alabama
## 5 1009 Blount AL Alabama
## 6 1011 Bullock AL Alabama
## 7 1013 Butler AL Alabama
## 8 1015 Calhoun AL Alabama
## 9 1017 Chambers AL Alabama
## 10 1019 Cherokee AL Alabama
## # ... with 3,136 more rows
dataset3 <- rename(dataset3, county = county_name)
dataset3
## # A tibble: 3,146 x 4
## fips county state_abbr state_name
## <dbl> <chr> <chr> <chr>
## 1 1001 Autauga AL Alabama
## 2 1003 Baldwin AL Alabama
## 3 1005 Barbour AL Alabama
## 4 1007 Bibb AL Alabama
## 5 1009 Blount AL Alabama
## 6 1011 Bullock AL Alabama
## 7 1013 Butler AL Alabama
## 8 1015 Calhoun AL Alabama
## 9 1017 Chambers AL Alabama
## 10 1019 Cherokee AL Alabama
## # ... with 3,136 more rows
dataset3[is.na(dataset3)] <- 0
str(dataset3)
## spec_tbl_df [3,146 x 4] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ fips : num [1:3146] 1001 1003 1005 1007 1009 ...
## $ county : chr [1:3146] "Autauga" "Baldwin" "Barbour" "Bibb" ...
## $ state_abbr: chr [1:3146] "AL" "AL" "AL" "AL" ...
## $ state_name: chr [1:3146] "Alabama" "Alabama" "Alabama" "Alabama" ...
## - attr(*, "spec")=
## .. cols(
## .. Fips = col_double(),
## .. County_Name = col_character(),
## .. State_Abbr = col_character(),
## .. State_Name = col_character()
## .. )
glimpse(dataset3)
## Rows: 3,146
## Columns: 4
## $ fips <dbl> 1001, 1003, 1005, 1007, 1009, 1011, 1013, 1015, 1017, 1019,~
## $ county <chr> "Autauga", "Baldwin", "Barbour", "Bibb", "Blount", "Bullock~
## $ state_abbr <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL",~
## $ state_name <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama", "Ala~
joint_df <- dataset2 %>% inner_join(dataset3,by="county")
joint_df
## # A tibble: 14,596 x 21
## fips_code state county high_school_dipl~ some_college_or_~ bachelors_degre~
## <dbl> <chr> <chr> <dbl> <dbl> <dbl>
## 1 2013 AK Aleutia~ 983 419 98
## 2 2016 AK Aleutia~ 1462 1407 469
## 3 2020 AK Anchora~ 38741 59428 46240
## 4 2050 AK Bethel ~ 3098 1548 1050
## 5 2060 AK Bristol~ 266 264 165
## 6 2068 AK Denali ~ 400 508 299
## 7 2070 AK Dilling~ 900 699 436
## 8 2090 AK Fairban~ 12240 18848 12968
## 9 2100 AK Haines ~ 517 564 395
## 10 2105 AK Hoonah-~ 0 0 0
## # ... with 14,586 more rows, and 15 more variables:
## # percent_of_adults_with_less_than_a_high_school_diploma_2000 <dbl>,
## # percent_of_adults_with_a_high_school_diploma_only_2000 <dbl>,
## # percent_of_adults_completing_some_college_or_associates_degree_2000 <dbl>,
## # percent_of_adults_with_a_bachelors_degree_or_higher_2000 <dbl>,
## # less_than_a_high_school_diploma_2015_19 <dbl>,
## # high_school_diploma_only_2015_19 <dbl>,
## # some_college_or_associates_degree_2015_19 <dbl>,
## # bachelors_degree_or_higher_2015_19 <dbl>,
## # percent_of_adults_with_less_than_a_high_school_diploma_2015_19 <dbl>,
## # percent_of_adults_with_a_high_school_diploma_only_2015_19 <dbl>,
## # percent_of_adults_completing_some_college_or_associates_degree_2015_19 <dbl>,
## # percent_of_adults_with_a_bachelors_degree_or_higher_2015_19 <dbl>,
## # fips <dbl>, state_abbr <chr>, state_name <chr>
#combined_df<- merge(dataset1, joint_df, by = 'county', all.x= TRUE)
combined_df = dataset1 %>% inner_join(joint_df,by="county")
combined_df
## # A tibble: 182,340 x 29
## population county state.x median_age household_income per_capita_income
## <dbl> <chr> <chr> <dbl> <dbl> <dbl>
## 1 55380 Autauga Alabama 38.2 58731 29819
## 2 212830 Baldwin Alabama 43 58320 32626
## 3 212830 Baldwin Alabama 43 58320 32626
## 4 212830 Baldwin Alabama 43 58320 32626
## 5 212830 Baldwin Alabama 43 58320 32626
## 6 25361 Barbour Alabama 40.4 32525 18473
## 7 25361 Barbour Alabama 40.4 32525 18473
## 8 25361 Barbour Alabama 40.4 32525 18473
## 9 25361 Barbour Alabama 40.4 32525 18473
## 10 22493 Bibb Alabama 40.9 47542 20778
## # ... with 182,330 more rows, and 23 more variables: poverty_count <dbl>,
## # poverty_rate <dbl>, unemployment_rate <dbl>, fips_code <dbl>,
## # state.y <chr>, high_school_diploma_only_2000 <dbl>,
## # some_college_or_associates_degree_2000 <dbl>,
## # bachelors_degree_or_higher_2000 <dbl>,
## # percent_of_adults_with_less_than_a_high_school_diploma_2000 <dbl>,
## # percent_of_adults_with_a_high_school_diploma_only_2000 <dbl>,
## # percent_of_adults_completing_some_college_or_associates_degree_2000 <dbl>,
## # percent_of_adults_with_a_bachelors_degree_or_higher_2000 <dbl>,
## # less_than_a_high_school_diploma_2015_19 <dbl>,
## # high_school_diploma_only_2015_19 <dbl>,
## # some_college_or_associates_degree_2015_19 <dbl>,
## # bachelors_degree_or_higher_2015_19 <dbl>,
## # percent_of_adults_with_less_than_a_high_school_diploma_2015_19 <dbl>,
## # percent_of_adults_with_a_high_school_diploma_only_2015_19 <dbl>,
## # percent_of_adults_completing_some_college_or_associates_degree_2015_19 <dbl>,
## # percent_of_adults_with_a_bachelors_degree_or_higher_2015_19 <dbl>,
## # fips <dbl>, state_abbr <chr>, state_name <chr>
col_remove <- c("state.y", "fips") # Define columns that should be dropped
combined_df <- combined_df %>% # Apply select & one_of functions
select(- one_of(col_remove))
combined_df
## # A tibble: 182,340 x 27
## population county state.x median_age household_income per_capita_income
## <dbl> <chr> <chr> <dbl> <dbl> <dbl>
## 1 55380 Autauga Alabama 38.2 58731 29819
## 2 212830 Baldwin Alabama 43 58320 32626
## 3 212830 Baldwin Alabama 43 58320 32626
## 4 212830 Baldwin Alabama 43 58320 32626
## 5 212830 Baldwin Alabama 43 58320 32626
## 6 25361 Barbour Alabama 40.4 32525 18473
## 7 25361 Barbour Alabama 40.4 32525 18473
## 8 25361 Barbour Alabama 40.4 32525 18473
## 9 25361 Barbour Alabama 40.4 32525 18473
## 10 22493 Bibb Alabama 40.9 47542 20778
## # ... with 182,330 more rows, and 21 more variables: poverty_count <dbl>,
## # poverty_rate <dbl>, unemployment_rate <dbl>, fips_code <dbl>,
## # high_school_diploma_only_2000 <dbl>,
## # some_college_or_associates_degree_2000 <dbl>,
## # bachelors_degree_or_higher_2000 <dbl>,
## # percent_of_adults_with_less_than_a_high_school_diploma_2000 <dbl>,
## # percent_of_adults_with_a_high_school_diploma_only_2000 <dbl>,
## # percent_of_adults_completing_some_college_or_associates_degree_2000 <dbl>,
## # percent_of_adults_with_a_bachelors_degree_or_higher_2000 <dbl>,
## # less_than_a_high_school_diploma_2015_19 <dbl>,
## # high_school_diploma_only_2015_19 <dbl>,
## # some_college_or_associates_degree_2015_19 <dbl>,
## # bachelors_degree_or_higher_2015_19 <dbl>,
## # percent_of_adults_with_less_than_a_high_school_diploma_2015_19 <dbl>,
## # percent_of_adults_with_a_high_school_diploma_only_2015_19 <dbl>,
## # percent_of_adults_completing_some_college_or_associates_degree_2015_19 <dbl>,
## # percent_of_adults_with_a_bachelors_degree_or_higher_2015_19 <dbl>,
## # state_abbr <chr>, state_name <chr>
combined_df <- rename(combined_df, state = state.x)
combined_df
## # A tibble: 182,340 x 27
## population county state median_age household_income per_capita_income
## <dbl> <chr> <chr> <dbl> <dbl> <dbl>
## 1 55380 Autauga Alabama 38.2 58731 29819
## 2 212830 Baldwin Alabama 43 58320 32626
## 3 212830 Baldwin Alabama 43 58320 32626
## 4 212830 Baldwin Alabama 43 58320 32626
## 5 212830 Baldwin Alabama 43 58320 32626
## 6 25361 Barbour Alabama 40.4 32525 18473
## 7 25361 Barbour Alabama 40.4 32525 18473
## 8 25361 Barbour Alabama 40.4 32525 18473
## 9 25361 Barbour Alabama 40.4 32525 18473
## 10 22493 Bibb Alabama 40.9 47542 20778
## # ... with 182,330 more rows, and 21 more variables: poverty_count <dbl>,
## # poverty_rate <dbl>, unemployment_rate <dbl>, fips_code <dbl>,
## # high_school_diploma_only_2000 <dbl>,
## # some_college_or_associates_degree_2000 <dbl>,
## # bachelors_degree_or_higher_2000 <dbl>,
## # percent_of_adults_with_less_than_a_high_school_diploma_2000 <dbl>,
## # percent_of_adults_with_a_high_school_diploma_only_2000 <dbl>,
## # percent_of_adults_completing_some_college_or_associates_degree_2000 <dbl>,
## # percent_of_adults_with_a_bachelors_degree_or_higher_2000 <dbl>,
## # less_than_a_high_school_diploma_2015_19 <dbl>,
## # high_school_diploma_only_2015_19 <dbl>,
## # some_college_or_associates_degree_2015_19 <dbl>,
## # bachelors_degree_or_higher_2015_19 <dbl>,
## # percent_of_adults_with_less_than_a_high_school_diploma_2015_19 <dbl>,
## # percent_of_adults_with_a_high_school_diploma_only_2015_19 <dbl>,
## # percent_of_adults_completing_some_college_or_associates_degree_2015_19 <dbl>,
## # percent_of_adults_with_a_bachelors_degree_or_higher_2015_19 <dbl>,
## # state_abbr <chr>, state_name <chr>
combined_df <- combined_df[!duplicated(combined_df[,c('county')]),]
combined_df
## # A tibble: 1,873 x 27
## population county state median_age household_income per_capita_income
## <dbl> <chr> <chr> <dbl> <dbl> <dbl>
## 1 55380 Autauga Alabama 38.2 58731 29819
## 2 212830 Baldwin Alabama 43 58320 32626
## 3 25361 Barbour Alabama 40.4 32525 18473
## 4 22493 Bibb Alabama 40.9 47542 20778
## 5 57681 Blount Alabama 40.7 49358 24747
## 6 10248 Bullock Alabama 40.2 37785 20877
## 7 19828 Butler Alabama 40.8 40688 21038
## 8 114618 Calhoun Alabama 39.6 47255 25345
## 9 33660 Chambers Alabama 42 42289 22729
## 10 25903 Cherokee Alabama 46.5 41919 24301
## # ... with 1,863 more rows, and 21 more variables: poverty_count <dbl>,
## # poverty_rate <dbl>, unemployment_rate <dbl>, fips_code <dbl>,
## # high_school_diploma_only_2000 <dbl>,
## # some_college_or_associates_degree_2000 <dbl>,
## # bachelors_degree_or_higher_2000 <dbl>,
## # percent_of_adults_with_less_than_a_high_school_diploma_2000 <dbl>,
## # percent_of_adults_with_a_high_school_diploma_only_2000 <dbl>,
## # percent_of_adults_completing_some_college_or_associates_degree_2000 <dbl>,
## # percent_of_adults_with_a_bachelors_degree_or_higher_2000 <dbl>,
## # less_than_a_high_school_diploma_2015_19 <dbl>,
## # high_school_diploma_only_2015_19 <dbl>,
## # some_college_or_associates_degree_2015_19 <dbl>,
## # bachelors_degree_or_higher_2015_19 <dbl>,
## # percent_of_adults_with_less_than_a_high_school_diploma_2015_19 <dbl>,
## # percent_of_adults_with_a_high_school_diploma_only_2015_19 <dbl>,
## # percent_of_adults_completing_some_college_or_associates_degree_2015_19 <dbl>,
## # percent_of_adults_with_a_bachelors_degree_or_higher_2015_19 <dbl>,
## # state_abbr <chr>, state_name <chr>
summary(combined_df)
## population county state median_age
## Min. : 66 Length:1873 Length:1873 Min. :22.30
## 1st Qu.: 10709 Class :character Class :character 1st Qu.:37.70
## Median : 25903 Mode :character Mode :character Median :40.90
## Mean : 111472 Mean :41.14
## 3rd Qu.: 76042 3rd Qu.:44.40
## Max. :10081570 Max. :59.70
## household_income per_capita_income poverty_count poverty_rate
## Min. : 22346 Min. :10388 Min. : 6 Min. : 2.356
## 1st Qu.: 43661 1st Qu.:23448 1st Qu.: 1553 1st Qu.:10.403
## Median : 51931 Median :27361 Median : 3941 Median :13.900
## Mean : 53717 Mean :28164 Mean : 14575 Mean :14.790
## 3rd Qu.: 60403 3rd Qu.:31609 3rd Qu.: 10627 3rd Qu.:18.165
## Max. :142299 Max. :76592 Max. :1480446 Max. :50.780
## unemployment_rate fips_code high_school_diploma_only_2000
## Min. : 0.000 Min. : 1001 Min. : 0
## 1st Qu.: 1.789 1st Qu.:16031 1st Qu.: 2543
## Median : 2.330 Median :27077 Median : 5718
## Mean : 2.459 Mean :28143 Mean : 21544
## 3rd Qu.: 2.959 3rd Qu.:42027 3rd Qu.: 14454
## Max. :11.358 Max. :56045 Max. :3480768
## some_college_or_associates_degree_2000 bachelors_degree_or_higher_2000
## Min. : 0 Min. : 0
## 1st Qu.: 1671 1st Qu.: 869
## Median : 3897 Median : 2233
## Mean : 21034 Mean : 18737
## 3rd Qu.: 12099 3rd Qu.: 7458
## Max. :3002232 Max. :3433212
## percent_of_adults_with_less_than_a_high_school_diploma_2000
## Min. : 0.00
## 1st Qu.:15.80
## Median :20.90
## Mean :22.72
## 3rd Qu.:29.20
## Max. :65.30
## percent_of_adults_with_a_high_school_diploma_only_2000
## Min. : 0.00
## 1st Qu.:29.90
## Median :34.30
## Mean :33.82
## 3rd Qu.:38.20
## Max. :53.20
## percent_of_adults_completing_some_college_or_associates_degree_2000
## Min. : 0.00
## 1st Qu.:22.20
## Median :26.40
## Mean :26.32
## 3rd Qu.:30.30
## Max. :43.10
## percent_of_adults_with_a_bachelors_degree_or_higher_2000
## Min. : 0.00
## 1st Qu.:11.30
## Median :14.80
## Mean :16.87
## 3rd Qu.:19.90
## Max. :63.70
## less_than_a_high_school_diploma_2015_19 high_school_diploma_only_2015_19
## Min. : 4 Min. : 14
## 1st Qu.: 944 1st Qu.: 2677
## Median : 2514 Median : 6520
## Mean : 11146 Mean : 24614
## 3rd Qu.: 6196 3rd Qu.: 16694
## Max. :1796594 Max. :3541274
## some_college_or_associates_degree_2015_19 bachelors_degree_or_higher_2015_19
## Min. : 20 Min. : 0
## 1st Qu.: 2253 1st Qu.: 1192
## Median : 5423 Median : 3287
## Mean : 26738 Mean : 29893
## 3rd Qu.: 16159 3rd Qu.: 12179
## Max. :3308262 Max. :4985807
## percent_of_adults_with_less_than_a_high_school_diploma_2015_19
## Min. : 1.10
## 1st Qu.: 8.50
## Median :11.70
## Mean :13.28
## 3rd Qu.:17.10
## Max. :73.60
## percent_of_adults_with_a_high_school_diploma_only_2015_19
## Min. : 7.30
## 1st Qu.:29.00
## Median :34.00
## Mean :33.59
## 3rd Qu.:38.70
## Max. :57.40
## percent_of_adults_completing_some_college_or_associates_degree_2015_19
## Min. : 5.20
## 1st Qu.:27.30
## Median :30.90
## Mean :30.84
## 3rd Qu.:34.30
## Max. :60.60
## percent_of_adults_with_a_bachelors_degree_or_higher_2015_19 state_abbr
## Min. : 0.0 Length:1873
## 1st Qu.:15.2 Class :character
## Median :19.7 Mode :character
## Mean :22.3
## 3rd Qu.:26.9
## Max. :77.6
## state_name
## Length:1873
## Class :character
## Mode :character
##
##
##
# Get Maximum value of the column by column name
max(combined_df$per_capita_income)
## [1] 76592
combined_df <- combined_df %>% relocate(state, .before = population)
combined_df
## # A tibble: 1,873 x 27
## state population county median_age household_income per_capita_income
## <chr> <dbl> <chr> <dbl> <dbl> <dbl>
## 1 Alabama 55380 Autauga 38.2 58731 29819
## 2 Alabama 212830 Baldwin 43 58320 32626
## 3 Alabama 25361 Barbour 40.4 32525 18473
## 4 Alabama 22493 Bibb 40.9 47542 20778
## 5 Alabama 57681 Blount 40.7 49358 24747
## 6 Alabama 10248 Bullock 40.2 37785 20877
## 7 Alabama 19828 Butler 40.8 40688 21038
## 8 Alabama 114618 Calhoun 39.6 47255 25345
## 9 Alabama 33660 Chambers 42 42289 22729
## 10 Alabama 25903 Cherokee 46.5 41919 24301
## # ... with 1,863 more rows, and 21 more variables: poverty_count <dbl>,
## # poverty_rate <dbl>, unemployment_rate <dbl>, fips_code <dbl>,
## # high_school_diploma_only_2000 <dbl>,
## # some_college_or_associates_degree_2000 <dbl>,
## # bachelors_degree_or_higher_2000 <dbl>,
## # percent_of_adults_with_less_than_a_high_school_diploma_2000 <dbl>,
## # percent_of_adults_with_a_high_school_diploma_only_2000 <dbl>,
## # percent_of_adults_completing_some_college_or_associates_degree_2000 <dbl>,
## # percent_of_adults_with_a_bachelors_degree_or_higher_2000 <dbl>,
## # less_than_a_high_school_diploma_2015_19 <dbl>,
## # high_school_diploma_only_2015_19 <dbl>,
## # some_college_or_associates_degree_2015_19 <dbl>,
## # bachelors_degree_or_higher_2015_19 <dbl>,
## # percent_of_adults_with_less_than_a_high_school_diploma_2015_19 <dbl>,
## # percent_of_adults_with_a_high_school_diploma_only_2015_19 <dbl>,
## # percent_of_adults_completing_some_college_or_associates_degree_2015_19 <dbl>,
## # percent_of_adults_with_a_bachelors_degree_or_higher_2015_19 <dbl>,
## # state_abbr <chr>, state_name <chr>
by_state <- combined_df %>% group_by(state)
by_state <- by_state %>% summarise(
population = mean(population),
per_capita_income = mean(per_capita_income),
poverty_count = mean(poverty_count),
bachelors_degree = mean(bachelors_degree_or_higher_2015_19)
)
by_state
## # A tibble: 51 x 5
## state population per_capita_inco~ poverty_count bachelors_degree
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Alabama 72780. 24049. 11880. 12623.
## 2 Alaska 26208. 32964. 2741. 5050.
## 3 Arizona 470020. 24500. 69584. 92968.
## 4 Arkansas 43025. 23487. 7057. 14818.
## 5 California 694827. 33567. 91151. 158552.
## 6 Colorado 90016. 32617. 9383. 24842.
## 7 Connecticut 446884. 42480. 43018. 121933.
## 8 Delaware 319083. 33956 36467. 71379.
## 9 District of Colum~ 692683 56147 107140 289259
## 10 Florida 336279. 28025. 46125. 73899.
## # ... with 41 more rows
library(ggplot2)
ggplot(by_state, aes(fill=state, y=poverty_count, x=state)) +
geom_bar(position='dodge', stat='identity',
color = "black", width = 0.60)
### To find which state exactly has the highest poverty_count in the above barplot since the sate is not detectable easily It shows District of Columbia has the highest poverty count
by_state %>% slice(which.max(poverty_count))
## # A tibble: 1 x 5
## state population per_capita_inco~ poverty_count bachelors_degree
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 District of Columb~ 692683 56147 107140 289259
library(ggplot2)
ggplot(by_state, aes(fill=state, y=bachelors_degree, x=state)) +
geom_bar(position='dodge', stat='identity',
color = "black", width = 0.60)
### To find which state exactly has the highest education average in the above barplot since the sate is not detectable easily It shows District of Columbia also has the highest bachelors_degree average
by_state %>% slice(which.max(bachelors_degree))
## # A tibble: 1 x 5
## state population per_capita_inco~ poverty_count bachelors_degree
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 District of Columb~ 692683 56147 107140 289259
# We can that
by_state %>%
ggplot(aes(x=poverty_count,
y=bachelors_degree,
color=state))+
geom_point()+
geom_smooth(method="lm",se = FALSE)
## `geom_smooth()` using formula 'y ~ x'
ggsave("add_regression_line_per_state_to_scatterplot_ggplot2.png")
## Saving 7 x 5 in image
## `geom_smooth()` using formula 'y ~ x'