Loading Libraries:

library(tidyverse)

## Warning: package 'tidyverse' was built under R version 4.1.3

## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --

## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.2     v dplyr   1.0.7
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.1

## Warning: package 'ggplot2' was built under R version 4.1.2

## Warning: package 'stringr' was built under R version 4.1.2

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(tidyr)
library(dplyr)
library(ggplot2)

Introduction:

There are three datasets in this file: 1- Census data 2- Economic Research Service -U.S.Department of Agriculture ERS-USDA data 3- FIPS Code data In this assignment, my aim is to perform data cleaning on all three data set, after initial analysis,merge thre three datasets on FIPS Code column and then find the co-relation between the variables and perform final analysis

Dataset1:

Reading census data from GitHub

library(readr)

dataset1 <- read_csv("https://raw.githubusercontent.com/uzmabb182/Data_607_Project2/main/census_us_county_data.csv")

## 
## -- Column specification --------------------------------------------------------
## cols(
##   County = col_character(),
##   Name = col_character(),
##   Population = col_double(),
##   `Median Age` = col_double(),
##   `Household Income` = col_double(),
##   `Per Capita Income` = col_double(),
##   `Poverty Count` = col_double(),
##   `Poverty Rate` = col_double(),
##   `Unemployment Rate` = col_double()
## )

Cleaning/Transforming census_us_county_data, dataset1:

head(dataset1)

## # A tibble: 6 x 9
##   County Name         Population `Median Age` `Household Inco~ `Per Capita Inco~
##   <chr>  <chr>             <dbl>        <dbl>            <dbl>             <dbl>
## 1 051    Fayette Cou~      21565         41.9            46650             23194
## 2 107    Logan Count~      29003         40.1            57308             27546
## 3 165    Saline Coun~      23994         42.2            44090             25342
## 4 097    Lake County~     701473         38.4            89427             45766
## 5 127    Massac Coun~      14219         43.5            47481             23539
## 6 017    Cass County~      12493         40              52373             26992
## # ... with 3 more variables: Poverty Count <dbl>, Poverty Rate <dbl>,
## #   Unemployment Rate <dbl>

Make the column names lowercase

names(dataset1) <- tolower(names(dataset1))    # Convert colnames to lower case
dataset1

## # A tibble: 3,220 x 9
##    county name        population `median age` `household incom~ `per capita inc~
##    <chr>  <chr>            <dbl>        <dbl>             <dbl>            <dbl>
##  1 051    Fayette Co~      21565         41.9             46650            23194
##  2 107    Logan Coun~      29003         40.1             57308            27546
##  3 165    Saline Cou~      23994         42.2             44090            25342
##  4 097    Lake Count~     701473         38.4             89427            45766
##  5 127    Massac Cou~      14219         43.5             47481            23539
##  6 017    Cass Count~      12493         40               52373            26992
##  7 069    Huntington~      36359         40.7             53632            26502
##  8 181    White Coun~      24149         42               54576            27461
##  9 075    Jay County~      20840         39.7             47658            23443
## 10 145    Shelby Cou~      44438         40.8             60404            29583
## # ... with 3,210 more rows, and 3 more variables: poverty count <dbl>,
## #   poverty rate <dbl>, unemployment rate <dbl>

Seperating a column into two columns dataset1

dataset1 <- dataset1 %>% separate(name, c("county", "state"), ",")

dataset1

## # A tibble: 3,220 x 9
##    population county     state   `median age` `household incom~ `per capita inc~
##         <dbl> <chr>      <chr>          <dbl>             <dbl>            <dbl>
##  1      21565 Fayette C~ " Illi~         41.9             46650            23194
##  2      29003 Logan Cou~ " Illi~         40.1             57308            27546
##  3      23994 Saline Co~ " Illi~         42.2             44090            25342
##  4     701473 Lake Coun~ " Illi~         38.4             89427            45766
##  5      14219 Massac Co~ " Illi~         43.5             47481            23539
##  6      12493 Cass Coun~ " Illi~         40               52373            26992
##  7      36359 Huntingto~ " Indi~         40.7             53632            26502
##  8      24149 White Cou~ " Indi~         42               54576            27461
##  9      20840 Jay County " Indi~         39.7             47658            23443
## 10      44438 Shelby Co~ " Indi~         40.8             60404            29583
## # ... with 3,210 more rows, and 3 more variables: poverty count <dbl>,
## #   poverty rate <dbl>, unemployment rate <dbl>

Dropping duplicates

dataset1 <- dataset1 %>% distinct()

dataset1

## # A tibble: 3,220 x 9
##    population county     state   `median age` `household incom~ `per capita inc~
##         <dbl> <chr>      <chr>          <dbl>             <dbl>            <dbl>
##  1      21565 Fayette C~ " Illi~         41.9             46650            23194
##  2      29003 Logan Cou~ " Illi~         40.1             57308            27546
##  3      23994 Saline Co~ " Illi~         42.2             44090            25342
##  4     701473 Lake Coun~ " Illi~         38.4             89427            45766
##  5      14219 Massac Co~ " Illi~         43.5             47481            23539
##  6      12493 Cass Coun~ " Illi~         40               52373            26992
##  7      36359 Huntingto~ " Indi~         40.7             53632            26502
##  8      24149 White Cou~ " Indi~         42               54576            27461
##  9      20840 Jay County " Indi~         39.7             47658            23443
## 10      44438 Shelby Co~ " Indi~         40.8             60404            29583
## # ... with 3,210 more rows, and 3 more variables: poverty count <dbl>,
## #   poverty rate <dbl>, unemployment rate <dbl>

Renaming the columns

names(dataset1)[4:9] <- c('median_age', 'household_income', 'per_capita_income', 'poverty_count',
                     'poverty_rate', 'unemployment_rate')

dataset1

## # A tibble: 3,220 x 9
##    population county       state    median_age household_income per_capita_inco~
##         <dbl> <chr>        <chr>         <dbl>            <dbl>            <dbl>
##  1      21565 Fayette Cou~ " Illin~       41.9            46650            23194
##  2      29003 Logan County " Illin~       40.1            57308            27546
##  3      23994 Saline Coun~ " Illin~       42.2            44090            25342
##  4     701473 Lake County  " Illin~       38.4            89427            45766
##  5      14219 Massac Coun~ " Illin~       43.5            47481            23539
##  6      12493 Cass County  " Illin~       40              52373            26992
##  7      36359 Huntington ~ " India~       40.7            53632            26502
##  8      24149 White County " India~       42              54576            27461
##  9      20840 Jay County   " India~       39.7            47658            23443
## 10      44438 Shelby Coun~ " India~       40.8            60404            29583
## # ... with 3,210 more rows, and 3 more variables: poverty_count <dbl>,
## #   poverty_rate <dbl>, unemployment_rate <dbl>

Removing the string ‘county’

dataset1$county <-gsub(" County","",as.character(dataset1$county))

dataset1

## # A tibble: 3,220 x 9
##    population county     state      median_age household_income per_capita_inco~
##         <dbl> <chr>      <chr>           <dbl>            <dbl>            <dbl>
##  1      21565 Fayette    " Illinoi~       41.9            46650            23194
##  2      29003 Logan      " Illinoi~       40.1            57308            27546
##  3      23994 Saline     " Illinoi~       42.2            44090            25342
##  4     701473 Lake       " Illinoi~       38.4            89427            45766
##  5      14219 Massac     " Illinoi~       43.5            47481            23539
##  6      12493 Cass       " Illinoi~       40              52373            26992
##  7      36359 Huntington " Indiana"       40.7            53632            26502
##  8      24149 White      " Indiana"       42              54576            27461
##  9      20840 Jay        " Indiana"       39.7            47658            23443
## 10      44438 Shelby     " Indiana"       40.8            60404            29583
## # ... with 3,210 more rows, and 3 more variables: poverty_count <dbl>,
## #   poverty_rate <dbl>, unemployment_rate <dbl>

glimpse(dataset1)

## Rows: 3,220
## Columns: 9
## $ population        <dbl> 21565, 29003, 23994, 701473, 14219, 12493, 36359, 24~
## $ county            <chr> "Fayette", "Logan", "Saline", "Lake", "Massac", "Cas~
## $ state             <chr> " Illinois", " Illinois", " Illinois", " Illinois", ~
## $ median_age        <dbl> 41.9, 40.1, 42.2, 38.4, 43.5, 40.0, 40.7, 42.0, 39.7~
## $ household_income  <dbl> 46650, 57308, 44090, 89427, 47481, 52373, 53632, 545~
## $ per_capita_income <dbl> 23194, 27546, 25342, 45766, 23539, 26992, 26502, 274~
## $ poverty_count     <dbl> 3421, 2323, 4936, 54273, 2331, 1822, 3809, 2198, 288~
## $ poverty_rate      <dbl> 15.863668, 8.009516, 20.571810, 7.737005, 16.393558,~
## $ unemployment_rate <dbl> 2.4345003, 2.5445644, 3.4008502, 2.7594790, 1.821506~

str(dataset1)

## tibble [3,220 x 9] (S3: tbl_df/tbl/data.frame)
##  $ population       : num [1:3220] 21565 29003 23994 701473 14219 ...
##  $ county           : chr [1:3220] "Fayette" "Logan" "Saline" "Lake" ...
##  $ state            : chr [1:3220] " Illinois" " Illinois" " Illinois" " Illinois" ...
##  $ median_age       : num [1:3220] 41.9 40.1 42.2 38.4 43.5 40 40.7 42 39.7 40.8 ...
##  $ household_income : num [1:3220] 46650 57308 44090 89427 47481 ...
##  $ per_capita_income: num [1:3220] 23194 27546 25342 45766 23539 ...
##  $ poverty_count    : num [1:3220] 3421 2323 4936 54273 2331 ...
##  $ poverty_rate     : num [1:3220] 15.86 8.01 20.57 7.74 16.39 ...
##  $ unemployment_rate: num [1:3220] 2.43 2.54 3.4 2.76 1.82 ...

dataset1$state <- trimws(dataset1$state, which = c("left"))

str(dataset1)

## tibble [3,220 x 9] (S3: tbl_df/tbl/data.frame)
##  $ population       : num [1:3220] 21565 29003 23994 701473 14219 ...
##  $ county           : chr [1:3220] "Fayette" "Logan" "Saline" "Lake" ...
##  $ state            : chr [1:3220] "Illinois" "Illinois" "Illinois" "Illinois" ...
##  $ median_age       : num [1:3220] 41.9 40.1 42.2 38.4 43.5 40 40.7 42 39.7 40.8 ...
##  $ household_income : num [1:3220] 46650 57308 44090 89427 47481 ...
##  $ per_capita_income: num [1:3220] 23194 27546 25342 45766 23539 ...
##  $ poverty_count    : num [1:3220] 3421 2323 4936 54273 2331 ...
##  $ poverty_rate     : num [1:3220] 15.86 8.01 20.57 7.74 16.39 ...
##  $ unemployment_rate: num [1:3220] 2.43 2.54 3.4 2.76 1.82 ...

Sorting by columns

# sort by state and county
# sort the dataframe in R using arrange

dataset1 <- arrange(dataset1,state, county)

dataset1

## # A tibble: 3,220 x 9
##    population county   state   median_age household_income per_capita_income
##         <dbl> <chr>    <chr>        <dbl>            <dbl>             <dbl>
##  1      55380 Autauga  Alabama       38.2            58731             29819
##  2     212830 Baldwin  Alabama       43              58320             32626
##  3      25361 Barbour  Alabama       40.4            32525             18473
##  4      22493 Bibb     Alabama       40.9            47542             20778
##  5      57681 Blount   Alabama       40.7            49358             24747
##  6      10248 Bullock  Alabama       40.2            37785             20877
##  7      19828 Butler   Alabama       40.8            40688             21038
##  8     114618 Calhoun  Alabama       39.6            47255             25345
##  9      33660 Chambers Alabama       42              42289             22729
## 10      25903 Cherokee Alabama       46.5            41919             24301
## # ... with 3,210 more rows, and 3 more variables: poverty_count <dbl>,
## #   poverty_rate <dbl>, unemployment_rate <dbl>

fill NA with 0 in dataset1

dataset1[is.na(dataset1)] <- 0

dataset1

## # A tibble: 3,220 x 9
##    population county   state   median_age household_income per_capita_income
##         <dbl> <chr>    <chr>        <dbl>            <dbl>             <dbl>
##  1      55380 Autauga  Alabama       38.2            58731             29819
##  2     212830 Baldwin  Alabama       43              58320             32626
##  3      25361 Barbour  Alabama       40.4            32525             18473
##  4      22493 Bibb     Alabama       40.9            47542             20778
##  5      57681 Blount   Alabama       40.7            49358             24747
##  6      10248 Bullock  Alabama       40.2            37785             20877
##  7      19828 Butler   Alabama       40.8            40688             21038
##  8     114618 Calhoun  Alabama       39.6            47255             25345
##  9      33660 Chambers Alabama       42              42289             22729
## 10      25903 Cherokee Alabama       46.5            41919             24301
## # ... with 3,210 more rows, and 3 more variables: poverty_count <dbl>,
## #   poverty_rate <dbl>, unemployment_rate <dbl>

Analysis:

Step 1: Select data frame Step 2: Group data Step 3: Summarize the data Step 4: Plot the summary statistics

library(ggplot2)
# Step 1
dataset1 %>% 
#Step 2
group_by(state)

## # A tibble: 3,220 x 9
## # Groups:   state [52]
##    population county   state   median_age household_income per_capita_income
##         <dbl> <chr>    <chr>        <dbl>            <dbl>             <dbl>
##  1      55380 Autauga  Alabama       38.2            58731             29819
##  2     212830 Baldwin  Alabama       43              58320             32626
##  3      25361 Barbour  Alabama       40.4            32525             18473
##  4      22493 Bibb     Alabama       40.9            47542             20778
##  5      57681 Blount   Alabama       40.7            49358             24747
##  6      10248 Bullock  Alabama       40.2            37785             20877
##  7      19828 Butler   Alabama       40.8            40688             21038
##  8     114618 Calhoun  Alabama       39.6            47255             25345
##  9      33660 Chambers Alabama       42              42289             22729
## 10      25903 Cherokee Alabama       46.5            41919             24301
## # ... with 3,210 more rows, and 3 more variables: poverty_count <dbl>,
## #   poverty_rate <dbl>, unemployment_rate <dbl>

Multiple columns data by bar plot:

# First filter the state Alabama and selecting the columns 

dataset1_alabama <- dataset1 %>% filter(state == "Alabama") %>%

  select(c(3:9, 1))

dataset1_alabama

## # A tibble: 67 x 8
##    state median_age household_income per_capita_inco~ poverty_count poverty_rate
##    <chr>      <dbl>            <dbl>            <dbl>         <dbl>        <dbl>
##  1 Alab~       38.2            58731            29819          8340         15.1
##  2 Alab~       43              58320            32626         21704         10.2
##  3 Alab~       40.4            32525            18473          6875         27.1
##  4 Alab~       40.9            47542            20778          3740         16.6
##  5 Alab~       40.7            49358            24747          7739         13.4
##  6 Alab~       40.2            37785            20877          2825         27.6
##  7 Alab~       40.8            40688            21038          4397         22.2
##  8 Alab~       39.6            47255            25345         19969         17.4
##  9 Alab~       42              42289            22729          5711         17.0
## 10 Alab~       46.5            41919            24301          3560         13.7
## # ... with 57 more rows, and 2 more variables: unemployment_rate <dbl>,
## #   population <dbl>

str(dataset1_alabama)

## tibble [67 x 8] (S3: tbl_df/tbl/data.frame)
##  $ state            : chr [1:67] "Alabama" "Alabama" "Alabama" "Alabama" ...
##  $ median_age       : num [1:67] 38.2 43 40.4 40.9 40.7 40.2 40.8 39.6 42 46.5 ...
##  $ household_income : num [1:67] 58731 58320 32525 47542 49358 ...
##  $ per_capita_income: num [1:67] 29819 32626 18473 20778 24747 ...
##  $ poverty_count    : num [1:67] 8340 21704 6875 3740 7739 ...
##  $ poverty_rate     : num [1:67] 15.1 10.2 27.1 16.6 13.4 ...
##  $ unemployment_rate: num [1:67] 1.69 1.99 3.35 2.93 1.32 ...
##  $ population       : num [1:67] 55380 212830 25361 22493 57681 ...

Transform the table from wide form to long form

data1_al <- dataset1_alabama %>%

  pivot_longer(cols = c(2:8),

               names_to = "Variable",

               values_to = "Value")

data1_al

## # A tibble: 469 x 3
##    state   Variable             Value
##    <chr>   <chr>                <dbl>
##  1 Alabama median_age           38.2 
##  2 Alabama household_income  58731   
##  3 Alabama per_capita_income 29819   
##  4 Alabama poverty_count      8340   
##  5 Alabama poverty_rate         15.1 
##  6 Alabama unemployment_rate     1.69
##  7 Alabama population        55380   
##  8 Alabama median_age           43   
##  9 Alabama household_income  58320   
## 10 Alabama per_capita_income 32626   
## # ... with 459 more rows

bar_plot <- ggplot(data = data1_al, aes(x = Variable, y = Value, fill = Variable)) +

  geom_bar(stat = "identity") + coord_flip() +
  
  labs(
        x = "Variables",
        y = "Values",
        title = paste(
            "Census data for Alabama"
        )
)
bar_plot

Dataset 2:

Reading the dataset 2 from GitHub

dataset2 <- read_csv("https://raw.githubusercontent.com/uzmabb182/Data_607_Project2/main/ers_usda_education_data.csv")

## 
## -- Column specification --------------------------------------------------------
## cols(
##   .default = col_double(),
##   State = col_character(),
##   `Area name` = col_character(),
##   `Less than a high school diploma, 1970` = col_number(),
##   `High school diploma only, 1970` = col_number(),
##   `Some college (1-3 years), 1970` = col_number(),
##   `Four years of college or higher, 1970` = col_number(),
##   `Less than a high school diploma, 1980` = col_number(),
##   `High school diploma only, 1980` = col_number(),
##   `Some college (1-3 years), 1980` = col_number(),
##   `Four years of college or higher, 1980` = col_number(),
##   `Less than a high school diploma, 1990` = col_number(),
##   `High school diploma only, 1990` = col_number(),
##   `Some college or associate's degree, 1990` = col_number(),
##   `Bachelor's degree or higher, 1990` = col_number(),
##   `Less than a high school diploma, 2000` = col_number(),
##   `High school diploma only, 2000` = col_number(),
##   `Some college or associate's degree, 2000` = col_number(),
##   `Bachelor's degree or higher, 2000` = col_number(),
##   `Less than a high school diploma, 2015-19` = col_number(),
##   `High school diploma only, 2015-19` = col_number()
##   # ... with 2 more columns
## )
## i Use `spec()` for the full column specifications.

Cleaning/Transforming ers_usda_education_data, dataset2:

head(dataset2)

## # A tibble: 6 x 47
##   `FIPS Code` State `Area name`   `2003 Rural-urban Conti~ `2003 Urban Influenc~
##         <dbl> <chr> <chr>                            <dbl>                 <dbl>
## 1           0 US    United States                       NA                    NA
## 2        1000 AL    Alabama                             NA                    NA
## 3        1001 AL    Autauga Coun~                        2                     2
## 4        1003 AL    Baldwin Coun~                        4                     5
## 5        1005 AL    Barbour Coun~                        6                     6
## 6        1007 AL    Bibb County                          1                     1
## # ... with 42 more variables: 2013 Rural-urban Continuum Code <dbl>,
## #   2013 Urban Influence Code <dbl>,
## #   Less than a high school diploma, 1970 <dbl>,
## #   High school diploma only, 1970 <dbl>, Some college (1-3 years), 1970 <dbl>,
## #   Four years of college or higher, 1970 <dbl>,
## #   Percent of adults with less than a high school diploma, 1970 <dbl>,
## #   Percent of adults with a high school diploma only, 1970 <dbl>,
## #   Percent of adults completing some college (1-3 years), 1970 <dbl>,
## #   Percent of adults completing four years of college or higher, 1970 <dbl>,
## #   Less than a high school diploma, 1980 <dbl>,
## #   High school diploma only, 1980 <dbl>, Some college (1-3 years), 1980 <dbl>,
## #   Four years of college or higher, 1980 <dbl>,
## #   Percent of adults with less than a high school diploma, 1980 <dbl>,
## #   Percent of adults with a high school diploma only, 1980 <dbl>,
## #   Percent of adults completing some college (1-3 years), 1980 <dbl>,
## #   Percent of adults completing four years of college or higher, 1980 <dbl>,
## #   Less than a high school diploma, 1990 <dbl>,
## #   High school diploma only, 1990 <dbl>,
## #   Some college or associate's degree, 1990 <dbl>,
## #   Bachelor's degree or higher, 1990 <dbl>,
## #   Percent of adults with less than a high school diploma, 1990 <dbl>,
## #   Percent of adults with a high school diploma only, 1990 <dbl>,
## #   Percent of adults completing some college or associate's degree, 1990 <dbl>,
## #   Percent of adults with a bachelor's degree or higher, 1990 <dbl>,
## #   Less than a high school diploma, 2000 <dbl>,
## #   High school diploma only, 2000 <dbl>,
## #   Some college or associate's degree, 2000 <dbl>,
## #   Bachelor's degree or higher, 2000 <dbl>,
## #   Percent of adults with less than a high school diploma, 2000 <dbl>,
## #   Percent of adults with a high school diploma only, 2000 <dbl>,
## #   Percent of adults completing some college or associate's degree, 2000 <dbl>,
## #   Percent of adults with a bachelor's degree or higher, 2000 <dbl>,
## #   Less than a high school diploma, 2015-19 <dbl>,
## #   High school diploma only, 2015-19 <dbl>,
## #   Some college or associate's degree, 2015-19 <dbl>,
## #   Bachelor's degree or higher, 2015-19 <dbl>,
## #   Percent of adults with less than a high school diploma, 2015-19 <dbl>,
## #   Percent of adults with a high school diploma only, 2015-19 <dbl>,
## #   Percent of adults completing some college or associate's degree, 2015-19 <dbl>,
## #   Percent of adults with a bachelor's degree or higher, 2015-19 <dbl>

colnames(dataset2)

##  [1] "FIPS Code"                                                               
##  [2] "State"                                                                   
##  [3] "Area name"                                                               
##  [4] "2003 Rural-urban Continuum Code"                                         
##  [5] "2003 Urban Influence Code"                                               
##  [6] "2013 Rural-urban Continuum Code"                                         
##  [7] "2013 Urban Influence Code"                                               
##  [8] "Less than a high school diploma, 1970"                                   
##  [9] "High school diploma only, 1970"                                          
## [10] "Some college (1-3 years), 1970"                                          
## [11] "Four years of college or higher, 1970"                                   
## [12] "Percent of adults with less than a high school diploma, 1970"            
## [13] "Percent of adults with a high school diploma only, 1970"                 
## [14] "Percent of adults completing some college (1-3 years), 1970"             
## [15] "Percent of adults completing four years of college or higher, 1970"      
## [16] "Less than a high school diploma, 1980"                                   
## [17] "High school diploma only, 1980"                                          
## [18] "Some college (1-3 years), 1980"                                          
## [19] "Four years of college or higher, 1980"                                   
## [20] "Percent of adults with less than a high school diploma, 1980"            
## [21] "Percent of adults with a high school diploma only, 1980"                 
## [22] "Percent of adults completing some college (1-3 years), 1980"             
## [23] "Percent of adults completing four years of college or higher, 1980"      
## [24] "Less than a high school diploma, 1990"                                   
## [25] "High school diploma only, 1990"                                          
## [26] "Some college or associate's degree, 1990"                                
## [27] "Bachelor's degree or higher, 1990"                                       
## [28] "Percent of adults with less than a high school diploma, 1990"            
## [29] "Percent of adults with a high school diploma only, 1990"                 
## [30] "Percent of adults completing some college or associate's degree, 1990"   
## [31] "Percent of adults with a bachelor's degree or higher, 1990"              
## [32] "Less than a high school diploma, 2000"                                   
## [33] "High school diploma only, 2000"                                          
## [34] "Some college or associate's degree, 2000"                                
## [35] "Bachelor's degree or higher, 2000"                                       
## [36] "Percent of adults with less than a high school diploma, 2000"            
## [37] "Percent of adults with a high school diploma only, 2000"                 
## [38] "Percent of adults completing some college or associate's degree, 2000"   
## [39] "Percent of adults with a bachelor's degree or higher, 2000"              
## [40] "Less than a high school diploma, 2015-19"                                
## [41] "High school diploma only, 2015-19"                                       
## [42] "Some college or associate's degree, 2015-19"                             
## [43] "Bachelor's degree or higher, 2015-19"                                    
## [44] "Percent of adults with less than a high school diploma, 2015-19"         
## [45] "Percent of adults with a high school diploma only, 2015-19"              
## [46] "Percent of adults completing some college or associate's degree, 2015-19"
## [47] "Percent of adults with a bachelor's degree or higher, 2015-19"

Dropping 4:32 columns

dataset2 <- select(dataset2, -(4:32))
                           

dataset2

## # A tibble: 3,283 x 18
##    `FIPS Code` State `Area name`  `High school diploma~ `Some college or associ~
##          <dbl> <chr> <chr>                        <dbl>                    <dbl>
##  1           0 US    United Stat~              52168981                 49864428
##  2        1000 AL    Alabama                     877216                   746495
##  3        1001 AL    Autauga Cou~                  9332                     7413
##  4        1003 AL    Baldwin Cou~                 28428                    28178
##  5        1005 AL    Barbour Cou~                  6124                     4025
##  6        1007 AL    Bibb County                   4838                     2756
##  7        1009 AL    Blount Coun~                 12136                     8371
##  8        1011 AL    Bullock Cou~                  2667                     1325
##  9        1013 AL    Butler Coun~                  4749                     3146
## 10        1015 AL    Calhoun Cou~                 23856                    19576
## # ... with 3,273 more rows, and 13 more variables:
## #   Bachelor's degree or higher, 2000 <dbl>,
## #   Percent of adults with less than a high school diploma, 2000 <dbl>,
## #   Percent of adults with a high school diploma only, 2000 <dbl>,
## #   Percent of adults completing some college or associate's degree, 2000 <dbl>,
## #   Percent of adults with a bachelor's degree or higher, 2000 <dbl>,
## #   Less than a high school diploma, 2015-19 <dbl>,
## #   High school diploma only, 2015-19 <dbl>,
## #   Some college or associate's degree, 2015-19 <dbl>,
## #   Bachelor's degree or higher, 2015-19 <dbl>,
## #   Percent of adults with less than a high school diploma, 2015-19 <dbl>,
## #   Percent of adults with a high school diploma only, 2015-19 <dbl>,
## #   Percent of adults completing some college or associate's degree, 2015-19 <dbl>,
## #   Percent of adults with a bachelor's degree or higher, 2015-19 <dbl>

Rename the columns by removing space with underscore

names(dataset2) <- gsub(" ", "_", names(dataset2))

dataset2

## # A tibble: 3,283 x 18
##    FIPS_Code State Area_name  `High_school_di~ `Some_college_o~ `Bachelor's_deg~
##        <dbl> <chr> <chr>                 <dbl>            <dbl>            <dbl>
##  1         0 US    United St~         52168981         49864428         44462605
##  2      1000 AL    Alabama              877216           746495           549608
##  3      1001 AL    Autauga C~             9332             7413             4972
##  4      1003 AL    Baldwin C~            28428            28178            22146
##  5      1005 AL    Barbour C~             6124             4025             2068
##  6      1007 AL    Bibb Coun~             4838             2756              962
##  7      1009 AL    Blount Co~            12136             8371             3235
##  8      1011 AL    Bullock C~             2667             1325              586
##  9      1013 AL    Butler Co~             4749             3146             1433
## 10      1015 AL    Calhoun C~            23856            19576            11265
## # ... with 3,273 more rows, and 12 more variables:
## #   Percent_of_adults_with_less_than_a_high_school_diploma,_2000 <dbl>,
## #   Percent_of_adults_with_a_high_school_diploma_only,_2000 <dbl>,
## #   Percent_of_adults_completing_some_college_or_associate's_degree,_2000 <dbl>,
## #   Percent_of_adults_with_a_bachelor's_degree_or_higher,_2000 <dbl>,
## #   Less_than_a_high_school_diploma,_2015-19 <dbl>,
## #   High_school_diploma_only,_2015-19 <dbl>,
## #   Some_college_or_associate's_degree,_2015-19 <dbl>,
## #   Bachelor's_degree_or_higher,_2015-19 <dbl>,
## #   Percent_of_adults_with_less_than_a_high_school_diploma,_2015-19 <dbl>,
## #   Percent_of_adults_with_a_high_school_diploma_only,_2015-19 <dbl>,
## #   Percent_of_adults_completing_some_college_or_associate's_degree,_2015-19 <dbl>,
## #   Percent_of_adults_with_a_bachelor's_degree_or_higher,_2015-19 <dbl>

Removing comma from column names

names(dataset2) <- gsub(",", "", names(dataset2))

dataset2

## # A tibble: 3,283 x 18
##    FIPS_Code State Area_name  High_school_dip~ `Some_college_o~ `Bachelor's_deg~
##        <dbl> <chr> <chr>                 <dbl>            <dbl>            <dbl>
##  1         0 US    United St~         52168981         49864428         44462605
##  2      1000 AL    Alabama              877216           746495           549608
##  3      1001 AL    Autauga C~             9332             7413             4972
##  4      1003 AL    Baldwin C~            28428            28178            22146
##  5      1005 AL    Barbour C~             6124             4025             2068
##  6      1007 AL    Bibb Coun~             4838             2756              962
##  7      1009 AL    Blount Co~            12136             8371             3235
##  8      1011 AL    Bullock C~             2667             1325              586
##  9      1013 AL    Butler Co~             4749             3146             1433
## 10      1015 AL    Calhoun C~            23856            19576            11265
## # ... with 3,273 more rows, and 12 more variables:
## #   Percent_of_adults_with_less_than_a_high_school_diploma_2000 <dbl>,
## #   Percent_of_adults_with_a_high_school_diploma_only_2000 <dbl>,
## #   Percent_of_adults_completing_some_college_or_associate's_degree_2000 <dbl>,
## #   Percent_of_adults_with_a_bachelor's_degree_or_higher_2000 <dbl>,
## #   Less_than_a_high_school_diploma_2015-19 <dbl>,
## #   High_school_diploma_only_2015-19 <dbl>,
## #   Some_college_or_associate's_degree_2015-19 <dbl>,
## #   Bachelor's_degree_or_higher_2015-19 <dbl>,
## #   Percent_of_adults_with_less_than_a_high_school_diploma_2015-19 <dbl>,
## #   Percent_of_adults_with_a_high_school_diploma_only_2015-19 <dbl>,
## #   Percent_of_adults_completing_some_college_or_associate's_degree_2015-19 <dbl>,
## #   Percent_of_adults_with_a_bachelor's_degree_or_higher_2015-19 <dbl>

Removing apostrophy from column names

names(dataset2) <- gsub("'", "", names(dataset2))

dataset2

## # A tibble: 3,283 x 18
##    FIPS_Code State Area_name  High_school_dip~ Some_college_or~ Bachelors_degre~
##        <dbl> <chr> <chr>                 <dbl>            <dbl>            <dbl>
##  1         0 US    United St~         52168981         49864428         44462605
##  2      1000 AL    Alabama              877216           746495           549608
##  3      1001 AL    Autauga C~             9332             7413             4972
##  4      1003 AL    Baldwin C~            28428            28178            22146
##  5      1005 AL    Barbour C~             6124             4025             2068
##  6      1007 AL    Bibb Coun~             4838             2756              962
##  7      1009 AL    Blount Co~            12136             8371             3235
##  8      1011 AL    Bullock C~             2667             1325              586
##  9      1013 AL    Butler Co~             4749             3146             1433
## 10      1015 AL    Calhoun C~            23856            19576            11265
## # ... with 3,273 more rows, and 12 more variables:
## #   Percent_of_adults_with_less_than_a_high_school_diploma_2000 <dbl>,
## #   Percent_of_adults_with_a_high_school_diploma_only_2000 <dbl>,
## #   Percent_of_adults_completing_some_college_or_associates_degree_2000 <dbl>,
## #   Percent_of_adults_with_a_bachelors_degree_or_higher_2000 <dbl>,
## #   Less_than_a_high_school_diploma_2015-19 <dbl>,
## #   High_school_diploma_only_2015-19 <dbl>,
## #   Some_college_or_associates_degree_2015-19 <dbl>,
## #   Bachelors_degree_or_higher_2015-19 <dbl>,
## #   Percent_of_adults_with_less_than_a_high_school_diploma_2015-19 <dbl>,
## #   Percent_of_adults_with_a_high_school_diploma_only_2015-19 <dbl>,
## #   Percent_of_adults_completing_some_college_or_associates_degree_2015-19 <dbl>,
## #   Percent_of_adults_with_a_bachelors_degree_or_higher_2015-19 <dbl>

Replacing dash with underscore in column names

names(dataset2) <- gsub("-", "_", names(dataset2))

dataset2

## # A tibble: 3,283 x 18
##    FIPS_Code State Area_name  High_school_dip~ Some_college_or~ Bachelors_degre~
##        <dbl> <chr> <chr>                 <dbl>            <dbl>            <dbl>
##  1         0 US    United St~         52168981         49864428         44462605
##  2      1000 AL    Alabama              877216           746495           549608
##  3      1001 AL    Autauga C~             9332             7413             4972
##  4      1003 AL    Baldwin C~            28428            28178            22146
##  5      1005 AL    Barbour C~             6124             4025             2068
##  6      1007 AL    Bibb Coun~             4838             2756              962
##  7      1009 AL    Blount Co~            12136             8371             3235
##  8      1011 AL    Bullock C~             2667             1325              586
##  9      1013 AL    Butler Co~             4749             3146             1433
## 10      1015 AL    Calhoun C~            23856            19576            11265
## # ... with 3,273 more rows, and 12 more variables:
## #   Percent_of_adults_with_less_than_a_high_school_diploma_2000 <dbl>,
## #   Percent_of_adults_with_a_high_school_diploma_only_2000 <dbl>,
## #   Percent_of_adults_completing_some_college_or_associates_degree_2000 <dbl>,
## #   Percent_of_adults_with_a_bachelors_degree_or_higher_2000 <dbl>,
## #   Less_than_a_high_school_diploma_2015_19 <dbl>,
## #   High_school_diploma_only_2015_19 <dbl>,
## #   Some_college_or_associates_degree_2015_19 <dbl>,
## #   Bachelors_degree_or_higher_2015_19 <dbl>,
## #   Percent_of_adults_with_less_than_a_high_school_diploma_2015_19 <dbl>,
## #   Percent_of_adults_with_a_high_school_diploma_only_2015_19 <dbl>,
## #   Percent_of_adults_completing_some_college_or_associates_degree_2015_19 <dbl>,
## #   Percent_of_adults_with_a_bachelors_degree_or_higher_2015_19 <dbl>

Making column names to lowercase

names(dataset2) <- tolower(names(dataset2))    # Convert colnames to lower case
dataset2

## # A tibble: 3,283 x 18
##    fips_code state area_name  high_school_dip~ some_college_or~ bachelors_degre~
##        <dbl> <chr> <chr>                 <dbl>            <dbl>            <dbl>
##  1         0 US    United St~         52168981         49864428         44462605
##  2      1000 AL    Alabama              877216           746495           549608
##  3      1001 AL    Autauga C~             9332             7413             4972
##  4      1003 AL    Baldwin C~            28428            28178            22146
##  5      1005 AL    Barbour C~             6124             4025             2068
##  6      1007 AL    Bibb Coun~             4838             2756              962
##  7      1009 AL    Blount Co~            12136             8371             3235
##  8      1011 AL    Bullock C~             2667             1325              586
##  9      1013 AL    Butler Co~             4749             3146             1433
## 10      1015 AL    Calhoun C~            23856            19576            11265
## # ... with 3,273 more rows, and 12 more variables:
## #   percent_of_adults_with_less_than_a_high_school_diploma_2000 <dbl>,
## #   percent_of_adults_with_a_high_school_diploma_only_2000 <dbl>,
## #   percent_of_adults_completing_some_college_or_associates_degree_2000 <dbl>,
## #   percent_of_adults_with_a_bachelors_degree_or_higher_2000 <dbl>,
## #   less_than_a_high_school_diploma_2015_19 <dbl>,
## #   high_school_diploma_only_2015_19 <dbl>,
## #   some_college_or_associates_degree_2015_19 <dbl>,
## #   bachelors_degree_or_higher_2015_19 <dbl>,
## #   percent_of_adults_with_less_than_a_high_school_diploma_2015_19 <dbl>,
## #   percent_of_adults_with_a_high_school_diploma_only_2015_19 <dbl>,
## #   percent_of_adults_completing_some_college_or_associates_degree_2015_19 <dbl>,
## #   percent_of_adults_with_a_bachelors_degree_or_higher_2015_19 <dbl>

Removing the string ‘county’ from ‘area_name’

dataset2$area_name <-gsub(" County","",as.character(dataset2$area_name))

dataset2

## # A tibble: 3,283 x 18
##    fips_code state area_name  high_school_dip~ some_college_or~ bachelors_degre~
##        <dbl> <chr> <chr>                 <dbl>            <dbl>            <dbl>
##  1         0 US    United St~         52168981         49864428         44462605
##  2      1000 AL    Alabama              877216           746495           549608
##  3      1001 AL    Autauga                9332             7413             4972
##  4      1003 AL    Baldwin               28428            28178            22146
##  5      1005 AL    Barbour                6124             4025             2068
##  6      1007 AL    Bibb                   4838             2756              962
##  7      1009 AL    Blount                12136             8371             3235
##  8      1011 AL    Bullock                2667             1325              586
##  9      1013 AL    Butler                 4749             3146             1433
## 10      1015 AL    Calhoun               23856            19576            11265
## # ... with 3,273 more rows, and 12 more variables:
## #   percent_of_adults_with_less_than_a_high_school_diploma_2000 <dbl>,
## #   percent_of_adults_with_a_high_school_diploma_only_2000 <dbl>,
## #   percent_of_adults_completing_some_college_or_associates_degree_2000 <dbl>,
## #   percent_of_adults_with_a_bachelors_degree_or_higher_2000 <dbl>,
## #   less_than_a_high_school_diploma_2015_19 <dbl>,
## #   high_school_diploma_only_2015_19 <dbl>,
## #   some_college_or_associates_degree_2015_19 <dbl>,
## #   bachelors_degree_or_higher_2015_19 <dbl>,
## #   percent_of_adults_with_less_than_a_high_school_diploma_2015_19 <dbl>,
## #   percent_of_adults_with_a_high_school_diploma_only_2015_19 <dbl>,
## #   percent_of_adults_completing_some_college_or_associates_degree_2015_19 <dbl>,
## #   percent_of_adults_with_a_bachelors_degree_or_higher_2015_19 <dbl>

Dropping the duplicate rows

dataset2 <- dataset2 %>% distinct()

dataset2

## # A tibble: 3,283 x 18
##    fips_code state area_name  high_school_dip~ some_college_or~ bachelors_degre~
##        <dbl> <chr> <chr>                 <dbl>            <dbl>            <dbl>
##  1         0 US    United St~         52168981         49864428         44462605
##  2      1000 AL    Alabama              877216           746495           549608
##  3      1001 AL    Autauga                9332             7413             4972
##  4      1003 AL    Baldwin               28428            28178            22146
##  5      1005 AL    Barbour                6124             4025             2068
##  6      1007 AL    Bibb                   4838             2756              962
##  7      1009 AL    Blount                12136             8371             3235
##  8      1011 AL    Bullock                2667             1325              586
##  9      1013 AL    Butler                 4749             3146             1433
## 10      1015 AL    Calhoun               23856            19576            11265
## # ... with 3,273 more rows, and 12 more variables:
## #   percent_of_adults_with_less_than_a_high_school_diploma_2000 <dbl>,
## #   percent_of_adults_with_a_high_school_diploma_only_2000 <dbl>,
## #   percent_of_adults_completing_some_college_or_associates_degree_2000 <dbl>,
## #   percent_of_adults_with_a_bachelors_degree_or_higher_2000 <dbl>,
## #   less_than_a_high_school_diploma_2015_19 <dbl>,
## #   high_school_diploma_only_2015_19 <dbl>,
## #   some_college_or_associates_degree_2015_19 <dbl>,
## #   bachelors_degree_or_higher_2015_19 <dbl>,
## #   percent_of_adults_with_less_than_a_high_school_diploma_2015_19 <dbl>,
## #   percent_of_adults_with_a_high_school_diploma_only_2015_19 <dbl>,
## #   percent_of_adults_completing_some_college_or_associates_degree_2015_19 <dbl>,
## #   percent_of_adults_with_a_bachelors_degree_or_higher_2015_19 <dbl>

Drop first row

dataset2 <- dataset2[-c(seq(1, 1, by=0)),]

dataset2

## # A tibble: 3,282 x 18
##    fips_code state area_name high_school_dipl~ some_college_or~ bachelors_degre~
##        <dbl> <chr> <chr>                 <dbl>            <dbl>            <dbl>
##  1      1000 AL    Alabama              877216           746495           549608
##  2      1001 AL    Autauga                9332             7413             4972
##  3      1003 AL    Baldwin               28428            28178            22146
##  4      1005 AL    Barbour                6124             4025             2068
##  5      1007 AL    Bibb                   4838             2756              962
##  6      1009 AL    Blount                12136             8371             3235
##  7      1011 AL    Bullock                2667             1325              586
##  8      1013 AL    Butler                 4749             3146             1433
##  9      1015 AL    Calhoun               23856            19576            11265
## 10      1017 AL    Chambers               7863             5517             2339
## # ... with 3,272 more rows, and 12 more variables:
## #   percent_of_adults_with_less_than_a_high_school_diploma_2000 <dbl>,
## #   percent_of_adults_with_a_high_school_diploma_only_2000 <dbl>,
## #   percent_of_adults_completing_some_college_or_associates_degree_2000 <dbl>,
## #   percent_of_adults_with_a_bachelors_degree_or_higher_2000 <dbl>,
## #   less_than_a_high_school_diploma_2015_19 <dbl>,
## #   high_school_diploma_only_2015_19 <dbl>,
## #   some_college_or_associates_degree_2015_19 <dbl>,
## #   bachelors_degree_or_higher_2015_19 <dbl>,
## #   percent_of_adults_with_less_than_a_high_school_diploma_2015_19 <dbl>,
## #   percent_of_adults_with_a_high_school_diploma_only_2015_19 <dbl>,
## #   percent_of_adults_completing_some_college_or_associates_degree_2015_19 <dbl>,
## #   percent_of_adults_with_a_bachelors_degree_or_higher_2015_19 <dbl>

fill NA with 0 in dataset3

dataset2[is.na(dataset2)] <- 0

dataset2

## # A tibble: 3,282 x 18
##    fips_code state area_name high_school_dipl~ some_college_or~ bachelors_degre~
##        <dbl> <chr> <chr>                 <dbl>            <dbl>            <dbl>
##  1      1000 AL    Alabama              877216           746495           549608
##  2      1001 AL    Autauga                9332             7413             4972
##  3      1003 AL    Baldwin               28428            28178            22146
##  4      1005 AL    Barbour                6124             4025             2068
##  5      1007 AL    Bibb                   4838             2756              962
##  6      1009 AL    Blount                12136             8371             3235
##  7      1011 AL    Bullock                2667             1325              586
##  8      1013 AL    Butler                 4749             3146             1433
##  9      1015 AL    Calhoun               23856            19576            11265
## 10      1017 AL    Chambers               7863             5517             2339
## # ... with 3,272 more rows, and 12 more variables:
## #   percent_of_adults_with_less_than_a_high_school_diploma_2000 <dbl>,
## #   percent_of_adults_with_a_high_school_diploma_only_2000 <dbl>,
## #   percent_of_adults_completing_some_college_or_associates_degree_2000 <dbl>,
## #   percent_of_adults_with_a_bachelors_degree_or_higher_2000 <dbl>,
## #   less_than_a_high_school_diploma_2015_19 <dbl>,
## #   high_school_diploma_only_2015_19 <dbl>,
## #   some_college_or_associates_degree_2015_19 <dbl>,
## #   bachelors_degree_or_higher_2015_19 <dbl>,
## #   percent_of_adults_with_less_than_a_high_school_diploma_2015_19 <dbl>,
## #   percent_of_adults_with_a_high_school_diploma_only_2015_19 <dbl>,
## #   percent_of_adults_completing_some_college_or_associates_degree_2015_19 <dbl>,
## #   percent_of_adults_with_a_bachelors_degree_or_higher_2015_19 <dbl>

names(dataset2)[3] <- c('county')

dataset2

## # A tibble: 3,282 x 18
##    fips_code state county  high_school_dipl~ some_college_or_~ bachelors_degree~
##        <dbl> <chr> <chr>               <dbl>             <dbl>             <dbl>
##  1      1000 AL    Alabama            877216            746495            549608
##  2      1001 AL    Autauga              9332              7413              4972
##  3      1003 AL    Baldwin             28428             28178             22146
##  4      1005 AL    Barbour              6124              4025              2068
##  5      1007 AL    Bibb                 4838              2756               962
##  6      1009 AL    Blount              12136              8371              3235
##  7      1011 AL    Bullock              2667              1325               586
##  8      1013 AL    Butler               4749              3146              1433
##  9      1015 AL    Calhoun             23856             19576             11265
## 10      1017 AL    Chambe~              7863              5517              2339
## # ... with 3,272 more rows, and 12 more variables:
## #   percent_of_adults_with_less_than_a_high_school_diploma_2000 <dbl>,
## #   percent_of_adults_with_a_high_school_diploma_only_2000 <dbl>,
## #   percent_of_adults_completing_some_college_or_associates_degree_2000 <dbl>,
## #   percent_of_adults_with_a_bachelors_degree_or_higher_2000 <dbl>,
## #   less_than_a_high_school_diploma_2015_19 <dbl>,
## #   high_school_diploma_only_2015_19 <dbl>,
## #   some_college_or_associates_degree_2015_19 <dbl>,
## #   bachelors_degree_or_higher_2015_19 <dbl>,
## #   percent_of_adults_with_less_than_a_high_school_diploma_2015_19 <dbl>,
## #   percent_of_adults_with_a_high_school_diploma_only_2015_19 <dbl>,
## #   percent_of_adults_completing_some_college_or_associates_degree_2015_19 <dbl>,
## #   percent_of_adults_with_a_bachelors_degree_or_higher_2015_19 <dbl>

# sort by state and county
# sort the dataframe in R using arrange

dataset2 <- arrange(dataset2,state, county)

dataset2

## # A tibble: 3,282 x 18
##    fips_code state county   high_school_dipl~ some_college_or_~ bachelors_degre~
##        <dbl> <chr> <chr>                <dbl>             <dbl>            <dbl>
##  1      2000 AK    Alaska              105812            135655            93807
##  2      2010 AK    Aleutia~                 0                 0                0
##  3      2013 AK    Aleutia~               983               419               98
##  4      2016 AK    Aleutia~              1462              1407              469
##  5      2020 AK    Anchora~             38741             59428            46240
##  6      2050 AK    Bethel ~              3098              1548             1050
##  7      2060 AK    Bristol~               266               264              165
##  8      2068 AK    Denali ~               400               508              299
##  9      2070 AK    Dilling~               900               699              436
## 10      2090 AK    Fairban~             12240             18848            12968
## # ... with 3,272 more rows, and 12 more variables:
## #   percent_of_adults_with_less_than_a_high_school_diploma_2000 <dbl>,
## #   percent_of_adults_with_a_high_school_diploma_only_2000 <dbl>,
## #   percent_of_adults_completing_some_college_or_associates_degree_2000 <dbl>,
## #   percent_of_adults_with_a_bachelors_degree_or_higher_2000 <dbl>,
## #   less_than_a_high_school_diploma_2015_19 <dbl>,
## #   high_school_diploma_only_2015_19 <dbl>,
## #   some_college_or_associates_degree_2015_19 <dbl>,
## #   bachelors_degree_or_higher_2015_19 <dbl>,
## #   percent_of_adults_with_less_than_a_high_school_diploma_2015_19 <dbl>,
## #   percent_of_adults_with_a_high_school_diploma_only_2015_19 <dbl>,
## #   percent_of_adults_completing_some_college_or_associates_degree_2015_19 <dbl>,
## #   percent_of_adults_with_a_bachelors_degree_or_higher_2015_19 <dbl>

glimpse(dataset2)

## Rows: 3,282
## Columns: 18
## $ fips_code                                                              <dbl> ~
## $ state                                                                  <chr> ~
## $ county                                                                 <chr> ~
## $ high_school_diploma_only_2000                                          <dbl> ~
## $ some_college_or_associates_degree_2000                                 <dbl> ~
## $ bachelors_degree_or_higher_2000                                        <dbl> ~
## $ percent_of_adults_with_less_than_a_high_school_diploma_2000            <dbl> ~
## $ percent_of_adults_with_a_high_school_diploma_only_2000                 <dbl> ~
## $ percent_of_adults_completing_some_college_or_associates_degree_2000    <dbl> ~
## $ percent_of_adults_with_a_bachelors_degree_or_higher_2000               <dbl> ~
## $ less_than_a_high_school_diploma_2015_19                                <dbl> ~
## $ high_school_diploma_only_2015_19                                       <dbl> ~
## $ some_college_or_associates_degree_2015_19                              <dbl> ~
## $ bachelors_degree_or_higher_2015_19                                     <dbl> ~
## $ percent_of_adults_with_less_than_a_high_school_diploma_2015_19         <dbl> ~
## $ percent_of_adults_with_a_high_school_diploma_only_2015_19              <dbl> ~
## $ percent_of_adults_completing_some_college_or_associates_degree_2015_19 <dbl> ~
## $ percent_of_adults_with_a_bachelors_degree_or_higher_2015_19            <dbl> ~

Dataset 3:

Reading the data from Github:

dataset3 <- read_csv("https://raw.githubusercontent.com/uzmabb182/Data_607_Project2/main/county_fips_data.csv")

## 
## -- Column specification --------------------------------------------------------
## cols(
##   Fips = col_double(),
##   County_Name = col_character(),
##   State_Abbr = col_character(),
##   State_Name = col_character()
## )

Tidying/Transforming dataset3

# Now tidying dataset3

head(dataset3)

## # A tibble: 6 x 4
##    Fips County_Name    State_Abbr State_Name
##   <dbl> <chr>          <chr>      <chr>     
## 1  1001 Autauga County AL         Alabama   
## 2  1003 Baldwin County AL         Alabama   
## 3  1005 Barbour County AL         Alabama   
## 4  1007 Bibb County    AL         Alabama   
## 5  1009 Blount County  AL         Alabama   
## 6  1011 Bullock County AL         Alabama

Making column names lowercase

names(dataset3) <- tolower(names(dataset3))    # Convert colnames to lower case
dataset3

## # A tibble: 3,146 x 4
##     fips county_name     state_abbr state_name
##    <dbl> <chr>           <chr>      <chr>     
##  1  1001 Autauga County  AL         Alabama   
##  2  1003 Baldwin County  AL         Alabama   
##  3  1005 Barbour County  AL         Alabama   
##  4  1007 Bibb County     AL         Alabama   
##  5  1009 Blount County   AL         Alabama   
##  6  1011 Bullock County  AL         Alabama   
##  7  1013 Butler County   AL         Alabama   
##  8  1015 Calhoun County  AL         Alabama   
##  9  1017 Chambers County AL         Alabama   
## 10  1019 Cherokee County AL         Alabama   
## # ... with 3,136 more rows

Removing the string ‘county’ from ‘county_name’ column values

dataset3$county_name <-gsub(" County","",as.character(dataset3$county_name))

dataset3

## # A tibble: 3,146 x 4
##     fips county_name state_abbr state_name
##    <dbl> <chr>       <chr>      <chr>     
##  1  1001 Autauga     AL         Alabama   
##  2  1003 Baldwin     AL         Alabama   
##  3  1005 Barbour     AL         Alabama   
##  4  1007 Bibb        AL         Alabama   
##  5  1009 Blount      AL         Alabama   
##  6  1011 Bullock     AL         Alabama   
##  7  1013 Butler      AL         Alabama   
##  8  1015 Calhoun     AL         Alabama   
##  9  1017 Chambers    AL         Alabama   
## 10  1019 Cherokee    AL         Alabama   
## # ... with 3,136 more rows

Rename county_name to county

dataset3 <- rename(dataset3, county = county_name)

dataset3

## # A tibble: 3,146 x 4
##     fips county   state_abbr state_name
##    <dbl> <chr>    <chr>      <chr>     
##  1  1001 Autauga  AL         Alabama   
##  2  1003 Baldwin  AL         Alabama   
##  3  1005 Barbour  AL         Alabama   
##  4  1007 Bibb     AL         Alabama   
##  5  1009 Blount   AL         Alabama   
##  6  1011 Bullock  AL         Alabama   
##  7  1013 Butler   AL         Alabama   
##  8  1015 Calhoun  AL         Alabama   
##  9  1017 Chambers AL         Alabama   
## 10  1019 Cherokee AL         Alabama   
## # ... with 3,136 more rows

Fill NA with 0

dataset3[is.na(dataset3)] <- 0

str(dataset3)

## spec_tbl_df [3,146 x 4] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ fips      : num [1:3146] 1001 1003 1005 1007 1009 ...
##  $ county    : chr [1:3146] "Autauga" "Baldwin" "Barbour" "Bibb" ...
##  $ state_abbr: chr [1:3146] "AL" "AL" "AL" "AL" ...
##  $ state_name: chr [1:3146] "Alabama" "Alabama" "Alabama" "Alabama" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   Fips = col_double(),
##   ..   County_Name = col_character(),
##   ..   State_Abbr = col_character(),
##   ..   State_Name = col_character()
##   .. )

glimpse(dataset3)

## Rows: 3,146
## Columns: 4
## $ fips       <dbl> 1001, 1003, 1005, 1007, 1009, 1011, 1013, 1015, 1017, 1019,~
## $ county     <chr> "Autauga", "Baldwin", "Barbour", "Bibb", "Blount", "Bullock~
## $ state_abbr <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL",~
## $ state_name <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama", "Ala~

Performing left join between Census and FIPS_Code data on ‘County’ data:

joint_df <- dataset2 %>% inner_join(dataset3,by="county")

joint_df

## # A tibble: 14,596 x 21
##    fips_code state county   high_school_dipl~ some_college_or_~ bachelors_degre~
##        <dbl> <chr> <chr>                <dbl>             <dbl>            <dbl>
##  1      2013 AK    Aleutia~               983               419               98
##  2      2016 AK    Aleutia~              1462              1407              469
##  3      2020 AK    Anchora~             38741             59428            46240
##  4      2050 AK    Bethel ~              3098              1548             1050
##  5      2060 AK    Bristol~               266               264              165
##  6      2068 AK    Denali ~               400               508              299
##  7      2070 AK    Dilling~               900               699              436
##  8      2090 AK    Fairban~             12240             18848            12968
##  9      2100 AK    Haines ~               517               564              395
## 10      2105 AK    Hoonah-~                 0                 0                0
## # ... with 14,586 more rows, and 15 more variables:
## #   percent_of_adults_with_less_than_a_high_school_diploma_2000 <dbl>,
## #   percent_of_adults_with_a_high_school_diploma_only_2000 <dbl>,
## #   percent_of_adults_completing_some_college_or_associates_degree_2000 <dbl>,
## #   percent_of_adults_with_a_bachelors_degree_or_higher_2000 <dbl>,
## #   less_than_a_high_school_diploma_2015_19 <dbl>,
## #   high_school_diploma_only_2015_19 <dbl>,
## #   some_college_or_associates_degree_2015_19 <dbl>,
## #   bachelors_degree_or_higher_2015_19 <dbl>,
## #   percent_of_adults_with_less_than_a_high_school_diploma_2015_19 <dbl>,
## #   percent_of_adults_with_a_high_school_diploma_only_2015_19 <dbl>,
## #   percent_of_adults_completing_some_college_or_associates_degree_2015_19 <dbl>,
## #   percent_of_adults_with_a_bachelors_degree_or_higher_2015_19 <dbl>,
## #   fips <dbl>, state_abbr <chr>, state_name <chr>

Now joining Census dataset1 to jointdataset

#combined_df<- merge(dataset1, joint_df, by = 'county', all.x= TRUE)
combined_df = dataset1 %>% inner_join(joint_df,by="county")

combined_df

## # A tibble: 182,340 x 29
##    population county  state.x median_age household_income per_capita_income
##         <dbl> <chr>   <chr>        <dbl>            <dbl>             <dbl>
##  1      55380 Autauga Alabama       38.2            58731             29819
##  2     212830 Baldwin Alabama       43              58320             32626
##  3     212830 Baldwin Alabama       43              58320             32626
##  4     212830 Baldwin Alabama       43              58320             32626
##  5     212830 Baldwin Alabama       43              58320             32626
##  6      25361 Barbour Alabama       40.4            32525             18473
##  7      25361 Barbour Alabama       40.4            32525             18473
##  8      25361 Barbour Alabama       40.4            32525             18473
##  9      25361 Barbour Alabama       40.4            32525             18473
## 10      22493 Bibb    Alabama       40.9            47542             20778
## # ... with 182,330 more rows, and 23 more variables: poverty_count <dbl>,
## #   poverty_rate <dbl>, unemployment_rate <dbl>, fips_code <dbl>,
## #   state.y <chr>, high_school_diploma_only_2000 <dbl>,
## #   some_college_or_associates_degree_2000 <dbl>,
## #   bachelors_degree_or_higher_2000 <dbl>,
## #   percent_of_adults_with_less_than_a_high_school_diploma_2000 <dbl>,
## #   percent_of_adults_with_a_high_school_diploma_only_2000 <dbl>,
## #   percent_of_adults_completing_some_college_or_associates_degree_2000 <dbl>,
## #   percent_of_adults_with_a_bachelors_degree_or_higher_2000 <dbl>,
## #   less_than_a_high_school_diploma_2015_19 <dbl>,
## #   high_school_diploma_only_2015_19 <dbl>,
## #   some_college_or_associates_degree_2015_19 <dbl>,
## #   bachelors_degree_or_higher_2015_19 <dbl>,
## #   percent_of_adults_with_less_than_a_high_school_diploma_2015_19 <dbl>,
## #   percent_of_adults_with_a_high_school_diploma_only_2015_19 <dbl>,
## #   percent_of_adults_completing_some_college_or_associates_degree_2015_19 <dbl>,
## #   percent_of_adults_with_a_bachelors_degree_or_higher_2015_19 <dbl>,
## #   fips <dbl>, state_abbr <chr>, state_name <chr>

Use the ‘select’ and ‘one_of’ functions of the dplyr package to delete columns

col_remove <- c("state.y", "fips")     # Define columns that should be dropped

Removing the repeated column

combined_df <- combined_df %>%            # Apply select & one_of functions
  select(- one_of(col_remove))

combined_df

## # A tibble: 182,340 x 27
##    population county  state.x median_age household_income per_capita_income
##         <dbl> <chr>   <chr>        <dbl>            <dbl>             <dbl>
##  1      55380 Autauga Alabama       38.2            58731             29819
##  2     212830 Baldwin Alabama       43              58320             32626
##  3     212830 Baldwin Alabama       43              58320             32626
##  4     212830 Baldwin Alabama       43              58320             32626
##  5     212830 Baldwin Alabama       43              58320             32626
##  6      25361 Barbour Alabama       40.4            32525             18473
##  7      25361 Barbour Alabama       40.4            32525             18473
##  8      25361 Barbour Alabama       40.4            32525             18473
##  9      25361 Barbour Alabama       40.4            32525             18473
## 10      22493 Bibb    Alabama       40.9            47542             20778
## # ... with 182,330 more rows, and 21 more variables: poverty_count <dbl>,
## #   poverty_rate <dbl>, unemployment_rate <dbl>, fips_code <dbl>,
## #   high_school_diploma_only_2000 <dbl>,
## #   some_college_or_associates_degree_2000 <dbl>,
## #   bachelors_degree_or_higher_2000 <dbl>,
## #   percent_of_adults_with_less_than_a_high_school_diploma_2000 <dbl>,
## #   percent_of_adults_with_a_high_school_diploma_only_2000 <dbl>,
## #   percent_of_adults_completing_some_college_or_associates_degree_2000 <dbl>,
## #   percent_of_adults_with_a_bachelors_degree_or_higher_2000 <dbl>,
## #   less_than_a_high_school_diploma_2015_19 <dbl>,
## #   high_school_diploma_only_2015_19 <dbl>,
## #   some_college_or_associates_degree_2015_19 <dbl>,
## #   bachelors_degree_or_higher_2015_19 <dbl>,
## #   percent_of_adults_with_less_than_a_high_school_diploma_2015_19 <dbl>,
## #   percent_of_adults_with_a_high_school_diploma_only_2015_19 <dbl>,
## #   percent_of_adults_completing_some_college_or_associates_degree_2015_19 <dbl>,
## #   percent_of_adults_with_a_bachelors_degree_or_higher_2015_19 <dbl>,
## #   state_abbr <chr>, state_name <chr>

Rename state.x column to state

combined_df <- rename(combined_df, state = state.x)

combined_df

## # A tibble: 182,340 x 27
##    population county  state   median_age household_income per_capita_income
##         <dbl> <chr>   <chr>        <dbl>            <dbl>             <dbl>
##  1      55380 Autauga Alabama       38.2            58731             29819
##  2     212830 Baldwin Alabama       43              58320             32626
##  3     212830 Baldwin Alabama       43              58320             32626
##  4     212830 Baldwin Alabama       43              58320             32626
##  5     212830 Baldwin Alabama       43              58320             32626
##  6      25361 Barbour Alabama       40.4            32525             18473
##  7      25361 Barbour Alabama       40.4            32525             18473
##  8      25361 Barbour Alabama       40.4            32525             18473
##  9      25361 Barbour Alabama       40.4            32525             18473
## 10      22493 Bibb    Alabama       40.9            47542             20778
## # ... with 182,330 more rows, and 21 more variables: poverty_count <dbl>,
## #   poverty_rate <dbl>, unemployment_rate <dbl>, fips_code <dbl>,
## #   high_school_diploma_only_2000 <dbl>,
## #   some_college_or_associates_degree_2000 <dbl>,
## #   bachelors_degree_or_higher_2000 <dbl>,
## #   percent_of_adults_with_less_than_a_high_school_diploma_2000 <dbl>,
## #   percent_of_adults_with_a_high_school_diploma_only_2000 <dbl>,
## #   percent_of_adults_completing_some_college_or_associates_degree_2000 <dbl>,
## #   percent_of_adults_with_a_bachelors_degree_or_higher_2000 <dbl>,
## #   less_than_a_high_school_diploma_2015_19 <dbl>,
## #   high_school_diploma_only_2015_19 <dbl>,
## #   some_college_or_associates_degree_2015_19 <dbl>,
## #   bachelors_degree_or_higher_2015_19 <dbl>,
## #   percent_of_adults_with_less_than_a_high_school_diploma_2015_19 <dbl>,
## #   percent_of_adults_with_a_high_school_diploma_only_2015_19 <dbl>,
## #   percent_of_adults_completing_some_college_or_associates_degree_2015_19 <dbl>,
## #   percent_of_adults_with_a_bachelors_degree_or_higher_2015_19 <dbl>,
## #   state_abbr <chr>, state_name <chr>

Remove duplicate rows of the dataframe

combined_df <- combined_df[!duplicated(combined_df[,c('county')]),]

combined_df

## # A tibble: 1,873 x 27
##    population county   state   median_age household_income per_capita_income
##         <dbl> <chr>    <chr>        <dbl>            <dbl>             <dbl>
##  1      55380 Autauga  Alabama       38.2            58731             29819
##  2     212830 Baldwin  Alabama       43              58320             32626
##  3      25361 Barbour  Alabama       40.4            32525             18473
##  4      22493 Bibb     Alabama       40.9            47542             20778
##  5      57681 Blount   Alabama       40.7            49358             24747
##  6      10248 Bullock  Alabama       40.2            37785             20877
##  7      19828 Butler   Alabama       40.8            40688             21038
##  8     114618 Calhoun  Alabama       39.6            47255             25345
##  9      33660 Chambers Alabama       42              42289             22729
## 10      25903 Cherokee Alabama       46.5            41919             24301
## # ... with 1,863 more rows, and 21 more variables: poverty_count <dbl>,
## #   poverty_rate <dbl>, unemployment_rate <dbl>, fips_code <dbl>,
## #   high_school_diploma_only_2000 <dbl>,
## #   some_college_or_associates_degree_2000 <dbl>,
## #   bachelors_degree_or_higher_2000 <dbl>,
## #   percent_of_adults_with_less_than_a_high_school_diploma_2000 <dbl>,
## #   percent_of_adults_with_a_high_school_diploma_only_2000 <dbl>,
## #   percent_of_adults_completing_some_college_or_associates_degree_2000 <dbl>,
## #   percent_of_adults_with_a_bachelors_degree_or_higher_2000 <dbl>,
## #   less_than_a_high_school_diploma_2015_19 <dbl>,
## #   high_school_diploma_only_2015_19 <dbl>,
## #   some_college_or_associates_degree_2015_19 <dbl>,
## #   bachelors_degree_or_higher_2015_19 <dbl>,
## #   percent_of_adults_with_less_than_a_high_school_diploma_2015_19 <dbl>,
## #   percent_of_adults_with_a_high_school_diploma_only_2015_19 <dbl>,
## #   percent_of_adults_completing_some_college_or_associates_degree_2015_19 <dbl>,
## #   percent_of_adults_with_a_bachelors_degree_or_higher_2015_19 <dbl>,
## #   state_abbr <chr>, state_name <chr>

Statistical Analysis:

summary(combined_df)

##    population          county             state             median_age   
##  Min.   :      66   Length:1873        Length:1873        Min.   :22.30  
##  1st Qu.:   10709   Class :character   Class :character   1st Qu.:37.70  
##  Median :   25903   Mode  :character   Mode  :character   Median :40.90  
##  Mean   :  111472                                         Mean   :41.14  
##  3rd Qu.:   76042                                         3rd Qu.:44.40  
##  Max.   :10081570                                         Max.   :59.70  
##  household_income per_capita_income poverty_count      poverty_rate   
##  Min.   : 22346   Min.   :10388     Min.   :      6   Min.   : 2.356  
##  1st Qu.: 43661   1st Qu.:23448     1st Qu.:   1553   1st Qu.:10.403  
##  Median : 51931   Median :27361     Median :   3941   Median :13.900  
##  Mean   : 53717   Mean   :28164     Mean   :  14575   Mean   :14.790  
##  3rd Qu.: 60403   3rd Qu.:31609     3rd Qu.:  10627   3rd Qu.:18.165  
##  Max.   :142299   Max.   :76592     Max.   :1480446   Max.   :50.780  
##  unemployment_rate   fips_code     high_school_diploma_only_2000
##  Min.   : 0.000    Min.   : 1001   Min.   :      0              
##  1st Qu.: 1.789    1st Qu.:16031   1st Qu.:   2543              
##  Median : 2.330    Median :27077   Median :   5718              
##  Mean   : 2.459    Mean   :28143   Mean   :  21544              
##  3rd Qu.: 2.959    3rd Qu.:42027   3rd Qu.:  14454              
##  Max.   :11.358    Max.   :56045   Max.   :3480768              
##  some_college_or_associates_degree_2000 bachelors_degree_or_higher_2000
##  Min.   :      0                        Min.   :      0                
##  1st Qu.:   1671                        1st Qu.:    869                
##  Median :   3897                        Median :   2233                
##  Mean   :  21034                        Mean   :  18737                
##  3rd Qu.:  12099                        3rd Qu.:   7458                
##  Max.   :3002232                        Max.   :3433212                
##  percent_of_adults_with_less_than_a_high_school_diploma_2000
##  Min.   : 0.00                                              
##  1st Qu.:15.80                                              
##  Median :20.90                                              
##  Mean   :22.72                                              
##  3rd Qu.:29.20                                              
##  Max.   :65.30                                              
##  percent_of_adults_with_a_high_school_diploma_only_2000
##  Min.   : 0.00                                         
##  1st Qu.:29.90                                         
##  Median :34.30                                         
##  Mean   :33.82                                         
##  3rd Qu.:38.20                                         
##  Max.   :53.20                                         
##  percent_of_adults_completing_some_college_or_associates_degree_2000
##  Min.   : 0.00                                                      
##  1st Qu.:22.20                                                      
##  Median :26.40                                                      
##  Mean   :26.32                                                      
##  3rd Qu.:30.30                                                      
##  Max.   :43.10                                                      
##  percent_of_adults_with_a_bachelors_degree_or_higher_2000
##  Min.   : 0.00                                           
##  1st Qu.:11.30                                           
##  Median :14.80                                           
##  Mean   :16.87                                           
##  3rd Qu.:19.90                                           
##  Max.   :63.70                                           
##  less_than_a_high_school_diploma_2015_19 high_school_diploma_only_2015_19
##  Min.   :      4                         Min.   :     14                 
##  1st Qu.:    944                         1st Qu.:   2677                 
##  Median :   2514                         Median :   6520                 
##  Mean   :  11146                         Mean   :  24614                 
##  3rd Qu.:   6196                         3rd Qu.:  16694                 
##  Max.   :1796594                         Max.   :3541274                 
##  some_college_or_associates_degree_2015_19 bachelors_degree_or_higher_2015_19
##  Min.   :     20                           Min.   :      0                   
##  1st Qu.:   2253                           1st Qu.:   1192                   
##  Median :   5423                           Median :   3287                   
##  Mean   :  26738                           Mean   :  29893                   
##  3rd Qu.:  16159                           3rd Qu.:  12179                   
##  Max.   :3308262                           Max.   :4985807                   
##  percent_of_adults_with_less_than_a_high_school_diploma_2015_19
##  Min.   : 1.10                                                 
##  1st Qu.: 8.50                                                 
##  Median :11.70                                                 
##  Mean   :13.28                                                 
##  3rd Qu.:17.10                                                 
##  Max.   :73.60                                                 
##  percent_of_adults_with_a_high_school_diploma_only_2015_19
##  Min.   : 7.30                                            
##  1st Qu.:29.00                                            
##  Median :34.00                                            
##  Mean   :33.59                                            
##  3rd Qu.:38.70                                            
##  Max.   :57.40                                            
##  percent_of_adults_completing_some_college_or_associates_degree_2015_19
##  Min.   : 5.20                                                         
##  1st Qu.:27.30                                                         
##  Median :30.90                                                         
##  Mean   :30.84                                                         
##  3rd Qu.:34.30                                                         
##  Max.   :60.60                                                         
##  percent_of_adults_with_a_bachelors_degree_or_higher_2015_19  state_abbr       
##  Min.   : 0.0                                                Length:1873       
##  1st Qu.:15.2                                                Class :character  
##  Median :19.7                                                Mode  :character  
##  Mean   :22.3                                                                  
##  3rd Qu.:26.9                                                                  
##  Max.   :77.6                                                                  
##   state_name       
##  Length:1873       
##  Class :character  
##  Mode  :character  
##                    
##                    
##

Find the maximun percapita income state

# Get Maximum value of the column by column name
 
max(combined_df$per_capita_income)

## [1] 76592

Move column state to 1st position

combined_df <- combined_df %>% relocate(state, .before = population)

combined_df

## # A tibble: 1,873 x 27
##    state   population county   median_age household_income per_capita_income
##    <chr>        <dbl> <chr>         <dbl>            <dbl>             <dbl>
##  1 Alabama      55380 Autauga        38.2            58731             29819
##  2 Alabama     212830 Baldwin        43              58320             32626
##  3 Alabama      25361 Barbour        40.4            32525             18473
##  4 Alabama      22493 Bibb           40.9            47542             20778
##  5 Alabama      57681 Blount         40.7            49358             24747
##  6 Alabama      10248 Bullock        40.2            37785             20877
##  7 Alabama      19828 Butler         40.8            40688             21038
##  8 Alabama     114618 Calhoun        39.6            47255             25345
##  9 Alabama      33660 Chambers       42              42289             22729
## 10 Alabama      25903 Cherokee       46.5            41919             24301
## # ... with 1,863 more rows, and 21 more variables: poverty_count <dbl>,
## #   poverty_rate <dbl>, unemployment_rate <dbl>, fips_code <dbl>,
## #   high_school_diploma_only_2000 <dbl>,
## #   some_college_or_associates_degree_2000 <dbl>,
## #   bachelors_degree_or_higher_2000 <dbl>,
## #   percent_of_adults_with_less_than_a_high_school_diploma_2000 <dbl>,
## #   percent_of_adults_with_a_high_school_diploma_only_2000 <dbl>,
## #   percent_of_adults_completing_some_college_or_associates_degree_2000 <dbl>,
## #   percent_of_adults_with_a_bachelors_degree_or_higher_2000 <dbl>,
## #   less_than_a_high_school_diploma_2015_19 <dbl>,
## #   high_school_diploma_only_2015_19 <dbl>,
## #   some_college_or_associates_degree_2015_19 <dbl>,
## #   bachelors_degree_or_higher_2015_19 <dbl>,
## #   percent_of_adults_with_less_than_a_high_school_diploma_2015_19 <dbl>,
## #   percent_of_adults_with_a_high_school_diploma_only_2015_19 <dbl>,
## #   percent_of_adults_completing_some_college_or_associates_degree_2015_19 <dbl>,
## #   percent_of_adults_with_a_bachelors_degree_or_higher_2015_19 <dbl>,
## #   state_abbr <chr>, state_name <chr>

Group by state and finfing the average of the following indicators

by_state <- combined_df %>% group_by(state)

by_state <- by_state %>% summarise(
  population = mean(population),
  per_capita_income = mean(per_capita_income),
  poverty_count = mean(poverty_count),
  bachelors_degree = mean(bachelors_degree_or_higher_2015_19)
)

by_state

## # A tibble: 51 x 5
##    state              population per_capita_inco~ poverty_count bachelors_degree
##    <chr>                   <dbl>            <dbl>         <dbl>            <dbl>
##  1 Alabama                72780.           24049.        11880.           12623.
##  2 Alaska                 26208.           32964.         2741.            5050.
##  3 Arizona               470020.           24500.        69584.           92968.
##  4 Arkansas               43025.           23487.         7057.           14818.
##  5 California            694827.           33567.        91151.          158552.
##  6 Colorado               90016.           32617.         9383.           24842.
##  7 Connecticut           446884.           42480.        43018.          121933.
##  8 Delaware              319083.           33956         36467.           71379.
##  9 District of Colum~    692683            56147        107140           289259 
## 10 Florida               336279.           28025.        46125.           73899.
## # ... with 41 more rows

To find the highest poverty_count by state

library(ggplot2)

ggplot(by_state, aes(fill=state, y=poverty_count, x=state)) +
  geom_bar(position='dodge', stat='identity',
           color = "black", width = 0.60)

### To find which state exactly has the highest poverty_count in the above barplot since the sate is not detectable easily It shows District of Columbia has the highest poverty count

by_state %>% slice(which.max(poverty_count))

## # A tibble: 1 x 5
##   state               population per_capita_inco~ poverty_count bachelors_degree
##   <chr>                    <dbl>            <dbl>         <dbl>            <dbl>
## 1 District of Columb~     692683            56147        107140           289259

Now repeat the above graph for education

library(ggplot2)

ggplot(by_state, aes(fill=state, y=bachelors_degree, x=state)) +
  geom_bar(position='dodge', stat='identity',
           color = "black", width = 0.60)

### To find which state exactly has the highest education average in the above barplot since the sate is not detectable easily It shows District of Columbia also has the highest bachelors_degree average

by_state %>% slice(which.max(bachelors_degree))

## # A tibble: 1 x 5
##   state               population per_capita_inco~ poverty_count bachelors_degree
##   <chr>                    <dbl>            <dbl>         <dbl>            <dbl>
## 1 District of Columb~     692683            56147        107140           289259

We can clearly see that there is a maximum increase for poverty and education for the state of District of Columbia

# We can that 

by_state %>%
  ggplot(aes(x=poverty_count, 
             y=bachelors_degree, 
             color=state))+
  geom_point()+
  geom_smooth(method="lm",se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

ggsave("add_regression_line_per_state_to_scatterplot_ggplot2.png")

## Saving 7 x 5 in image
## `geom_smooth()` using formula 'y ~ x'

Data 607-Tidying and Transforming Data