library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.3.3
## -- Attaching packages ----------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 2.2.1 v purrr 0.2.4
## v tibble 1.4.2 v dplyr 0.7.4
## v tidyr 0.7.2 v stringr 1.2.0
## v readr 1.1.1 v forcats 0.2.0
## Warning: package 'ggplot2' was built under R version 3.3.3
## Warning: package 'tibble' was built under R version 3.3.3
## Warning: package 'tidyr' was built under R version 3.3.3
## Warning: package 'readr' was built under R version 3.3.3
## Warning: package 'purrr' was built under R version 3.3.3
## Warning: package 'dplyr' was built under R version 3.3.3
## Warning: package 'stringr' was built under R version 3.3.3
## Warning: package 'forcats' was built under R version 3.3.3
## -- Conflicts -------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(tidyr)
library(dplyr)
setwd("/Users/kartr/Documents/R Scripts/HW_1/HW_kt2630")
data<-read.csv("Employment_by_Congressional_District.csv")
# 1) Which district has the most jobs in construction?
filter_construction<-filter(data, NAICS_Decription=="Construction")
## Warning: package 'bindrcpp' was built under R version 3.3.3
ascend_construction<-arrange(filter_construction, desc(Employment))
ascend_construction[1:10, ]
## Congressional_District NAICS_Decription Employment
## 1 LA 6 Construction 58288
## 2 NY 12 Construction 38877
## 3 TX 24 Construction 34315
## 4 TX 2 Construction 33409
## 5 TX 36 Construction 29717
## 6 TX 7 Construction 28496
## 7 TX 29 Construction 28021
## 8 TX 21 Construction 27954
## 9 CO 7 Construction 27498
## 10 PA 18 Construction 26692
#ANSWER: Congressional_District LA 6 has 58,288 employees - it has the largest number of jobs in construction
#2) Find the congressional district that you currently live in using the following website: https://www.govtrack.us/congress/members/map. What industry categories have the highest and lowest employment in your district?
# Morningside Heights falls under New York's 10th Congressional District
filter_NY12<-filter(data, Congressional_District=="NY 12")
ascend_NY12<-arrange(filter_NY12, desc(Employment))
head(ascend_NY12, 10)
## Congressional_District NAICS_Decription
## 1 NY 12 Professional, scientific, and technical
## 2 NY 12 Finance and insurance
## 3 NY 12 Accommodation and food services
## 4 NY 12 Health care and social assistance
## 5 NY 12 Information
## 6 NY 12 Administrative and support and waste
## 7 NY 12 Retail trade
## 8 NY 12 Wholesale trade
## 9 NY 12 Other services (except public administration)
## 10 NY 12 Management of companies and enterprises
## Employment
## 1 220379
## 2 196024
## 3 152731
## 4 136728
## 5 114715
## 6 104186
## 7 99659
## 8 79394
## 9 77789
## 10 71993
ascend_NY12[1:10, ]
## Congressional_District NAICS_Decription
## 1 NY 12 Professional, scientific, and technical
## 2 NY 12 Finance and insurance
## 3 NY 12 Accommodation and food services
## 4 NY 12 Health care and social assistance
## 5 NY 12 Information
## 6 NY 12 Administrative and support and waste
## 7 NY 12 Retail trade
## 8 NY 12 Wholesale trade
## 9 NY 12 Other services (except public administration)
## 10 NY 12 Management of companies and enterprises
## Employment
## 1 220379
## 2 196024
## 3 152731
## 4 136728
## 5 114715
## 6 104186
## 7 99659
## 8 79394
## 9 77789
## 10 71993
top_NYC<-slice(ascend_NY12, 1:1)
bottom_NYC<-slice(ascend_NY12, n():n())
top_bottom <- bind_rows(top_NYC, bottom_NYC)
top_bottom
## # A tibble: 2 x 3
## Congressional_District NAICS_Decription Employment
## <fct> <fct> <int>
## 1 NY 12 Professional, scientific, and technic~ 220379
## 2 NY 12 Mining, quarrying, and oil and gas ex~ 56
#In NYC the highest employment is in Professional scientific, and technical (220,379) and lowest is in mining, quarrying, and oil and gast extraction (56)
#3) Rename the variable "Congressional_District" to "CD"
renamed_data <- rename(data, CD=Congressional_District)
head(renamed_data, 10)
## CD NAICS_Decription Employment
## 1 AK 1 Agriculture, forestry, fishing and hunting 819
## 2 AK 1 Mining, quarrying, and oil and gas extraction 13566
## 3 AK 1 Utilities 2081
## 4 AK 1 Construction 18581
## 5 AK 1 Manufacturing 12294
## 6 AK 1 Wholesale trade 9125
## 7 AK 1 Retail trade 35221
## 8 AK 1 Transportation and warehousing 18736
## 9 AK 1 Information 6711
## 10 AK 1 Finance and insurance 7340
#4) Use this variable to create two variables that are called "state" and "congressional district". (hint:separate by splitting the CD variable into two variables, one for two digit state codes and one with district numbers)
separate_data <- renamed_data %>% separate(CD, c("State", "District"), sep = "\\ ")
head(separate_data, 10)
## State District NAICS_Decription Employment
## 1 AK 1 Agriculture, forestry, fishing and hunting 819
## 2 AK 1 Mining, quarrying, and oil and gas extraction 13566
## 3 AK 1 Utilities 2081
## 4 AK 1 Construction 18581
## 5 AK 1 Manufacturing 12294
## 6 AK 1 Wholesale trade 9125
## 7 AK 1 Retail trade 35221
## 8 AK 1 Transportation and warehousing 18736
## 9 AK 1 Information 6711
## 10 AK 1 Finance and insurance 7340
#5) Unite these two variables back into a single variable called "CD_united"
united_data <- unite(separate_data, "CD_United", "State", "District", sep = " ")
head(united_data, 10)
## CD_United NAICS_Decription Employment
## 1 AK 1 Agriculture, forestry, fishing and hunting 819
## 2 AK 1 Mining, quarrying, and oil and gas extraction 13566
## 3 AK 1 Utilities 2081
## 4 AK 1 Construction 18581
## 5 AK 1 Manufacturing 12294
## 6 AK 1 Wholesale trade 9125
## 7 AK 1 Retail trade 35221
## 8 AK 1 Transportation and warehousing 18736
## 9 AK 1 Information 6711
## 10 AK 1 Finance and insurance 7340
#6) Create two new datasets.
#6.1) a new dataset from the original dataset that includes all data except the Employment variable (be sure to give this dataset a new name).
new_data_1 <- subset(data, select=c("Congressional_District", "NAICS_Decription"))
head(new_data_1, 10)
## Congressional_District NAICS_Decription
## 1 AK 1 Agriculture, forestry, fishing and hunting
## 2 AK 1 Mining, quarrying, and oil and gas extraction
## 3 AK 1 Utilities
## 4 AK 1 Construction
## 5 AK 1 Manufacturing
## 6 AK 1 Wholesale trade
## 7 AK 1 Retail trade
## 8 AK 1 Transportation and warehousing
## 9 AK 1 Information
## 10 AK 1 Finance and insurance
#6.2) a new dataset from the original dataset that only includes Congressional districts and Employment.
new_data_2 <- subset(data, select=c("Congressional_District", "Employment"))
head(new_data_2, 10)
## Congressional_District Employment
## 1 AK 1 819
## 2 AK 1 13566
## 3 AK 1 2081
## 4 AK 1 18581
## 5 AK 1 12294
## 6 AK 1 9125
## 7 AK 1 35221
## 8 AK 1 18736
## 9 AK 1 6711
## 10 AK 1 7340
#6.3) add a new column to both datasets called "mergeid" that includes a sequence of numbers beginning with a 1 in the first row of the data and ending with 8722 in the last row of the data (hint:"df1$mergeid<-seq(1,8722,1)")
new_data_1$mergeid <- seq(1,8722,1)
new_data_2$mergeid <- seq(1,8722,1)
head(new_data_1, 10)
## Congressional_District NAICS_Decription
## 1 AK 1 Agriculture, forestry, fishing and hunting
## 2 AK 1 Mining, quarrying, and oil and gas extraction
## 3 AK 1 Utilities
## 4 AK 1 Construction
## 5 AK 1 Manufacturing
## 6 AK 1 Wholesale trade
## 7 AK 1 Retail trade
## 8 AK 1 Transportation and warehousing
## 9 AK 1 Information
## 10 AK 1 Finance and insurance
## mergeid
## 1 1
## 2 2
## 3 3
## 4 4
## 5 5
## 6 6
## 7 7
## 8 8
## 9 9
## 10 10
head(new_data_2, 10)
## Congressional_District Employment mergeid
## 1 AK 1 819 1
## 2 AK 1 13566 2
## 3 AK 1 2081 3
## 4 AK 1 18581 4
## 5 AK 1 12294 5
## 6 AK 1 9125 6
## 7 AK 1 35221 7
## 8 AK 1 18736 8
## 9 AK 1 6711 9
## 10 AK 1 7340 10
#7) Join the two datasets from question six together to recreate the original dataset plus the new merge id.
join_data <- left_join(new_data_1, new_data_2, by=c("mergeid", "Congressional_District"))
head(join_data, 10)
## Congressional_District NAICS_Decription
## 1 AK 1 Agriculture, forestry, fishing and hunting
## 2 AK 1 Mining, quarrying, and oil and gas extraction
## 3 AK 1 Utilities
## 4 AK 1 Construction
## 5 AK 1 Manufacturing
## 6 AK 1 Wholesale trade
## 7 AK 1 Retail trade
## 8 AK 1 Transportation and warehousing
## 9 AK 1 Information
## 10 AK 1 Finance and insurance
## mergeid Employment
## 1 1 819
## 2 2 13566
## 3 3 2081
## 4 4 18581
## 5 5 12294
## 6 6 9125
## 7 7 35221
## 8 8 18736
## 9 9 6711
## 10 10 7340
#8) Group the data by industry categories and then summarise the average number of employment per industry. (arrange from most to least employees)
group_data <- group_by(join_data, NAICS_Decription)
head(group_data, 10)
## # A tibble: 10 x 4
## # Groups: NAICS_Decription [10]
## Congressional_District NAICS_Decription mergeid Employment
## <fct> <fct> <dbl> <int>
## 1 AK 1 Agriculture, forestry, fishi~ 1.00 819
## 2 AK 1 Mining, quarrying, and oil a~ 2.00 13566
## 3 AK 1 Utilities 3.00 2081
## 4 AK 1 Construction 4.00 18581
## 5 AK 1 Manufacturing 5.00 12294
## 6 AK 1 Wholesale trade 6.00 9125
## 7 AK 1 Retail trade 7.00 35221
## 8 AK 1 Transportation and warehousi~ 8.00 18736
## 9 AK 1 Information 9.00 6711
## 10 AK 1 Finance and insurance 10.0 7340
sum_data <- summarise(group_data, Employment=mean(Employment, na.rm=TRUE))
head(sum_data, 10)
## # A tibble: 10 x 2
## NAICS_Decription Employment
## <fct> <dbl>
## 1 Accommodation and food services 30366
## 2 Administrative and support and waste 18205
## 3 Agriculture, forestry, fishing and hunting 366
## 4 Arts, entertainment, and recreation 5112
## 5 Construction 13691
## 6 Educational services 8362
## 7 Finance and insurance 13783
## 8 Health care and social assistance 44059
## 9 Industries not classified 57.1
## 10 Information 7738
#9) Let's reshape the original dataset into a wide format using tidy. Create a wide dataset that keeps industry categories in a single column, but spreads Congressional districts to multiple individual columns with each column delineating a single district. (hint: you should only have one row per industry in the resulting data and employment should be shown as values under each district column)
wide_data <- spread(data, Congressional_District, Employment)
wide_data[1:10, 1:10]
## NAICS_Decription AK 1 AL 1 AL 2 AL 3
## 1 Accommodation and food services 28112 29067 24892 21075
## 2 Administrative and support and waste 17628 15559 13053 10887
## 3 Agriculture, forestry, fishing and hunting 819 1196 1762 688
## 4 Arts, entertainment, and recreation 4639 2996 3753 1117
## 5 Construction 18581 15516 8957 6979
## 6 Educational services 3025 4882 3821 3288
## 7 Finance and insurance 7340 7760 6992 4581
## 8 Health care and social assistance 49240 30929 36033 26115
## 9 Industries not classified 76 28 89 26
## 10 Information 6711 3997 3416 1968
## AL 4 AL 5 AL 6 AL 7 AR 1
## 1 17891 27023 26588 24853 18634
## 2 9185 12414 19994 18710 6227
## 3 879 189 574 1309 1028
## 4 966 2472 4495 1650 1517
## 5 6824 10506 15028 14321 6833
## 6 701 3622 5399 3991 2143
## 7 5792 6630 21357 15218 5742
## 8 29705 38179 29929 50611 37509
## 9 60 30 33 40 56
## 10 1960 5520 10993 4867 6355
#10) Now return the data to a long (tidy) format by moving districts back into a single column and employment in a single column (the final data should include three columns again: one for Congressional districts one for industry categories, and one for employment.)
long_data <- gather(wide_data, Congressional_District, Employment, 2:438)
head(long_data, 10)
## NAICS_Decription Congressional_District
## 1 Accommodation and food services AK 1
## 2 Administrative and support and waste AK 1
## 3 Agriculture, forestry, fishing and hunting AK 1
## 4 Arts, entertainment, and recreation AK 1
## 5 Construction AK 1
## 6 Educational services AK 1
## 7 Finance and insurance AK 1
## 8 Health care and social assistance AK 1
## 9 Industries not classified AK 1
## 10 Information AK 1
## Employment
## 1 28112
## 2 17628
## 3 819
## 4 4639
## 5 18581
## 6 3025
## 7 7340
## 8 49240
## 9 76
## 10 6711
head(data, 10)
## Congressional_District NAICS_Decription
## 1 AK 1 Agriculture, forestry, fishing and hunting
## 2 AK 1 Mining, quarrying, and oil and gas extraction
## 3 AK 1 Utilities
## 4 AK 1 Construction
## 5 AK 1 Manufacturing
## 6 AK 1 Wholesale trade
## 7 AK 1 Retail trade
## 8 AK 1 Transportation and warehousing
## 9 AK 1 Information
## 10 AK 1 Finance and insurance
## Employment
## 1 819
## 2 13566
## 3 2081
## 4 18581
## 5 12294
## 6 9125
## 7 35221
## 8 18736
## 9 6711
## 10 7340