HW_1

library(tidyverse)

## Warning: package 'tidyverse' was built under R version 3.3.3

## -- Attaching packages ----------------------------------------------------------- tidyverse 1.2.1 --

## v ggplot2 2.2.1     v purrr   0.2.4
## v tibble  1.4.2     v dplyr   0.7.4
## v tidyr   0.7.2     v stringr 1.2.0
## v readr   1.1.1     v forcats 0.2.0

## Warning: package 'ggplot2' was built under R version 3.3.3

## Warning: package 'tibble' was built under R version 3.3.3

## Warning: package 'tidyr' was built under R version 3.3.3

## Warning: package 'readr' was built under R version 3.3.3

## Warning: package 'purrr' was built under R version 3.3.3

## Warning: package 'dplyr' was built under R version 3.3.3

## Warning: package 'stringr' was built under R version 3.3.3

## Warning: package 'forcats' was built under R version 3.3.3

## -- Conflicts -------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(tidyr)
library(dplyr)
setwd("/Users/kartr/Documents/R Scripts/HW_1/HW_kt2630")
data<-read.csv("Employment_by_Congressional_District.csv")

# 1) Which district has the most jobs in construction?
filter_construction<-filter(data, NAICS_Decription=="Construction")

## Warning: package 'bindrcpp' was built under R version 3.3.3

ascend_construction<-arrange(filter_construction, desc(Employment))
ascend_construction[1:10, ]

##    Congressional_District NAICS_Decription Employment
## 1                    LA 6     Construction      58288
## 2                   NY 12     Construction      38877
## 3                   TX 24     Construction      34315
## 4                    TX 2     Construction      33409
## 5                   TX 36     Construction      29717
## 6                    TX 7     Construction      28496
## 7                   TX 29     Construction      28021
## 8                   TX 21     Construction      27954
## 9                    CO 7     Construction      27498
## 10                  PA 18     Construction      26692

#ANSWER: Congressional_District LA 6 has 58,288 employees - it has the largest number of jobs in construction

#2) Find the congressional district that you currently live in using the following website: https://www.govtrack.us/congress/members/map.  What industry categories have the highest and lowest employment in your district?
# Morningside Heights falls under New York's 10th Congressional District
filter_NY12<-filter(data, Congressional_District=="NY 12")
ascend_NY12<-arrange(filter_NY12, desc(Employment))
head(ascend_NY12, 10)

##    Congressional_District                              NAICS_Decription
## 1                   NY 12       Professional, scientific, and technical
## 2                   NY 12                         Finance and insurance
## 3                   NY 12               Accommodation and food services
## 4                   NY 12             Health care and social assistance
## 5                   NY 12                                   Information
## 6                   NY 12          Administrative and support and waste
## 7                   NY 12                                  Retail trade
## 8                   NY 12                               Wholesale trade
## 9                   NY 12 Other services (except public administration)
## 10                  NY 12       Management of companies and enterprises
##    Employment
## 1      220379
## 2      196024
## 3      152731
## 4      136728
## 5      114715
## 6      104186
## 7       99659
## 8       79394
## 9       77789
## 10      71993

ascend_NY12[1:10, ]

##    Congressional_District                              NAICS_Decription
## 1                   NY 12       Professional, scientific, and technical
## 2                   NY 12                         Finance and insurance
## 3                   NY 12               Accommodation and food services
## 4                   NY 12             Health care and social assistance
## 5                   NY 12                                   Information
## 6                   NY 12          Administrative and support and waste
## 7                   NY 12                                  Retail trade
## 8                   NY 12                               Wholesale trade
## 9                   NY 12 Other services (except public administration)
## 10                  NY 12       Management of companies and enterprises
##    Employment
## 1      220379
## 2      196024
## 3      152731
## 4      136728
## 5      114715
## 6      104186
## 7       99659
## 8       79394
## 9       77789
## 10      71993

top_NYC<-slice(ascend_NY12, 1:1)
bottom_NYC<-slice(ascend_NY12, n():n())
top_bottom <- bind_rows(top_NYC, bottom_NYC)
top_bottom

## # A tibble: 2 x 3
##   Congressional_District NAICS_Decription                       Employment
##   <fct>                  <fct>                                       <int>
## 1 NY 12                  Professional, scientific, and technic~     220379
## 2 NY 12                  Mining, quarrying, and oil and gas ex~         56

#In NYC the highest employment is in Professional scientific, and technical (220,379) and lowest is in mining, quarrying, and oil and gast extraction (56)

#3) Rename the variable "Congressional_District" to "CD"
renamed_data <- rename(data, CD=Congressional_District)
head(renamed_data, 10)

##      CD                              NAICS_Decription Employment
## 1  AK 1    Agriculture, forestry, fishing and hunting        819
## 2  AK 1 Mining, quarrying, and oil and gas extraction      13566
## 3  AK 1                                     Utilities       2081
## 4  AK 1                                  Construction      18581
## 5  AK 1                                 Manufacturing      12294
## 6  AK 1                               Wholesale trade       9125
## 7  AK 1                                  Retail trade      35221
## 8  AK 1                Transportation and warehousing      18736
## 9  AK 1                                   Information       6711
## 10 AK 1                         Finance and insurance       7340

#4) Use this variable to create two variables that are called "state" and "congressional district".  (hint:separate by splitting the CD variable into two variables, one for two digit state codes and one with district numbers)
separate_data <- renamed_data %>% separate(CD, c("State", "District"), sep = "\\ ")
head(separate_data, 10)

##    State District                              NAICS_Decription Employment
## 1     AK        1    Agriculture, forestry, fishing and hunting        819
## 2     AK        1 Mining, quarrying, and oil and gas extraction      13566
## 3     AK        1                                     Utilities       2081
## 4     AK        1                                  Construction      18581
## 5     AK        1                                 Manufacturing      12294
## 6     AK        1                               Wholesale trade       9125
## 7     AK        1                                  Retail trade      35221
## 8     AK        1                Transportation and warehousing      18736
## 9     AK        1                                   Information       6711
## 10    AK        1                         Finance and insurance       7340

#5) Unite these two variables back into a single variable called "CD_united"
united_data <- unite(separate_data, "CD_United", "State", "District", sep = " ")
head(united_data, 10)

##    CD_United                              NAICS_Decription Employment
## 1       AK 1    Agriculture, forestry, fishing and hunting        819
## 2       AK 1 Mining, quarrying, and oil and gas extraction      13566
## 3       AK 1                                     Utilities       2081
## 4       AK 1                                  Construction      18581
## 5       AK 1                                 Manufacturing      12294
## 6       AK 1                               Wholesale trade       9125
## 7       AK 1                                  Retail trade      35221
## 8       AK 1                Transportation and warehousing      18736
## 9       AK 1                                   Information       6711
## 10      AK 1                         Finance and insurance       7340

#6) Create two new datasets.
#6.1) a new dataset from the original dataset that includes all data except the Employment variable (be sure to give this dataset a new name).
new_data_1 <- subset(data, select=c("Congressional_District", "NAICS_Decription"))
head(new_data_1, 10)

##    Congressional_District                              NAICS_Decription
## 1                    AK 1    Agriculture, forestry, fishing and hunting
## 2                    AK 1 Mining, quarrying, and oil and gas extraction
## 3                    AK 1                                     Utilities
## 4                    AK 1                                  Construction
## 5                    AK 1                                 Manufacturing
## 6                    AK 1                               Wholesale trade
## 7                    AK 1                                  Retail trade
## 8                    AK 1                Transportation and warehousing
## 9                    AK 1                                   Information
## 10                   AK 1                         Finance and insurance

#6.2) a new dataset from the original dataset that only includes Congressional districts and Employment.
new_data_2 <- subset(data, select=c("Congressional_District", "Employment"))
head(new_data_2, 10)

##    Congressional_District Employment
## 1                    AK 1        819
## 2                    AK 1      13566
## 3                    AK 1       2081
## 4                    AK 1      18581
## 5                    AK 1      12294
## 6                    AK 1       9125
## 7                    AK 1      35221
## 8                    AK 1      18736
## 9                    AK 1       6711
## 10                   AK 1       7340

#6.3) add a new column to both datasets called "mergeid" that includes a sequence of numbers beginning with a 1 in the first row of the data and ending with 8722 in the last row of the data (hint:"df1$mergeid<-seq(1,8722,1)")
new_data_1$mergeid <- seq(1,8722,1)
new_data_2$mergeid <- seq(1,8722,1)
head(new_data_1, 10)

##    Congressional_District                              NAICS_Decription
## 1                    AK 1    Agriculture, forestry, fishing and hunting
## 2                    AK 1 Mining, quarrying, and oil and gas extraction
## 3                    AK 1                                     Utilities
## 4                    AK 1                                  Construction
## 5                    AK 1                                 Manufacturing
## 6                    AK 1                               Wholesale trade
## 7                    AK 1                                  Retail trade
## 8                    AK 1                Transportation and warehousing
## 9                    AK 1                                   Information
## 10                   AK 1                         Finance and insurance
##    mergeid
## 1        1
## 2        2
## 3        3
## 4        4
## 5        5
## 6        6
## 7        7
## 8        8
## 9        9
## 10      10

head(new_data_2, 10)

##    Congressional_District Employment mergeid
## 1                    AK 1        819       1
## 2                    AK 1      13566       2
## 3                    AK 1       2081       3
## 4                    AK 1      18581       4
## 5                    AK 1      12294       5
## 6                    AK 1       9125       6
## 7                    AK 1      35221       7
## 8                    AK 1      18736       8
## 9                    AK 1       6711       9
## 10                   AK 1       7340      10

#7) Join the two datasets from question six together to recreate the original dataset plus the new merge id.
join_data <- left_join(new_data_1, new_data_2, by=c("mergeid", "Congressional_District"))
head(join_data, 10)

##    Congressional_District                              NAICS_Decription
## 1                    AK 1    Agriculture, forestry, fishing and hunting
## 2                    AK 1 Mining, quarrying, and oil and gas extraction
## 3                    AK 1                                     Utilities
## 4                    AK 1                                  Construction
## 5                    AK 1                                 Manufacturing
## 6                    AK 1                               Wholesale trade
## 7                    AK 1                                  Retail trade
## 8                    AK 1                Transportation and warehousing
## 9                    AK 1                                   Information
## 10                   AK 1                         Finance and insurance
##    mergeid Employment
## 1        1        819
## 2        2      13566
## 3        3       2081
## 4        4      18581
## 5        5      12294
## 6        6       9125
## 7        7      35221
## 8        8      18736
## 9        9       6711
## 10      10       7340

#8) Group the data by industry categories and then summarise the average number of employment per industry. (arrange from most to least employees)
group_data <- group_by(join_data, NAICS_Decription)
head(group_data, 10)

## # A tibble: 10 x 4
## # Groups:   NAICS_Decription [10]
##    Congressional_District NAICS_Decription              mergeid Employment
##    <fct>                  <fct>                           <dbl>      <int>
##  1 AK 1                   Agriculture, forestry, fishi~    1.00        819
##  2 AK 1                   Mining, quarrying, and oil a~    2.00      13566
##  3 AK 1                   Utilities                        3.00       2081
##  4 AK 1                   Construction                     4.00      18581
##  5 AK 1                   Manufacturing                    5.00      12294
##  6 AK 1                   Wholesale trade                  6.00       9125
##  7 AK 1                   Retail trade                     7.00      35221
##  8 AK 1                   Transportation and warehousi~    8.00      18736
##  9 AK 1                   Information                      9.00       6711
## 10 AK 1                   Finance and insurance           10.0        7340

sum_data <- summarise(group_data, Employment=mean(Employment, na.rm=TRUE))
head(sum_data, 10)

## # A tibble: 10 x 2
##    NAICS_Decription                           Employment
##    <fct>                                           <dbl>
##  1 Accommodation and food services               30366  
##  2 Administrative and support and waste          18205  
##  3 Agriculture, forestry, fishing and hunting      366  
##  4 Arts, entertainment, and recreation            5112  
##  5 Construction                                  13691  
##  6 Educational services                           8362  
##  7 Finance and insurance                         13783  
##  8 Health care and social assistance             44059  
##  9 Industries not classified                        57.1
## 10 Information                                    7738

#9) Let's reshape the original dataset into a wide format using tidy.  Create a wide dataset that keeps industry categories in a single column, but spreads Congressional districts to multiple individual columns with each column delineating a single district.  (hint: you should only have one row per industry in the resulting data and employment should be shown as values under each district column)
wide_data <- spread(data, Congressional_District, Employment)
wide_data[1:10, 1:10]

##                              NAICS_Decription  AK 1  AL 1  AL 2  AL 3
## 1             Accommodation and food services 28112 29067 24892 21075
## 2        Administrative and support and waste 17628 15559 13053 10887
## 3  Agriculture, forestry, fishing and hunting   819  1196  1762   688
## 4         Arts, entertainment, and recreation  4639  2996  3753  1117
## 5                                Construction 18581 15516  8957  6979
## 6                        Educational services  3025  4882  3821  3288
## 7                       Finance and insurance  7340  7760  6992  4581
## 8           Health care and social assistance 49240 30929 36033 26115
## 9                   Industries not classified    76    28    89    26
## 10                                Information  6711  3997  3416  1968
##     AL 4  AL 5  AL 6  AL 7  AR 1
## 1  17891 27023 26588 24853 18634
## 2   9185 12414 19994 18710  6227
## 3    879   189   574  1309  1028
## 4    966  2472  4495  1650  1517
## 5   6824 10506 15028 14321  6833
## 6    701  3622  5399  3991  2143
## 7   5792  6630 21357 15218  5742
## 8  29705 38179 29929 50611 37509
## 9     60    30    33    40    56
## 10  1960  5520 10993  4867  6355

#10) Now return the data to a long (tidy) format by moving districts back into a single column and employment in a single column (the final data should include three columns again: one for Congressional districts one for industry categories, and one for employment.)
long_data <- gather(wide_data, Congressional_District, Employment, 2:438)
head(long_data, 10)

##                              NAICS_Decription Congressional_District
## 1             Accommodation and food services                   AK 1
## 2        Administrative and support and waste                   AK 1
## 3  Agriculture, forestry, fishing and hunting                   AK 1
## 4         Arts, entertainment, and recreation                   AK 1
## 5                                Construction                   AK 1
## 6                        Educational services                   AK 1
## 7                       Finance and insurance                   AK 1
## 8           Health care and social assistance                   AK 1
## 9                   Industries not classified                   AK 1
## 10                                Information                   AK 1
##    Employment
## 1       28112
## 2       17628
## 3         819
## 4        4639
## 5       18581
## 6        3025
## 7        7340
## 8       49240
## 9          76
## 10       6711

head(data, 10)

##    Congressional_District                              NAICS_Decription
## 1                    AK 1    Agriculture, forestry, fishing and hunting
## 2                    AK 1 Mining, quarrying, and oil and gas extraction
## 3                    AK 1                                     Utilities
## 4                    AK 1                                  Construction
## 5                    AK 1                                 Manufacturing
## 6                    AK 1                               Wholesale trade
## 7                    AK 1                                  Retail trade
## 8                    AK 1                Transportation and warehousing
## 9                    AK 1                                   Information
## 10                   AK 1                         Finance and insurance
##    Employment
## 1         819
## 2       13566
## 3        2081
## 4       18581
## 5       12294
## 6        9125
## 7       35221
## 8       18736
## 9        6711
## 10       7340

HW_1

kt2630

February 7, 2018