This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
In HW1, we will practice coding with core tidyverse functions by using Employment_by_Congressional_District.csv data set.
library(tidyverse)
## -- Attaching packages ---------------------------------- tidyverse 1.2.1 --
## <U+221A> ggplot2 2.2.1 <U+221A> purrr 0.2.4
## <U+221A> tibble 1.4.2 <U+221A> dplyr 0.7.4
## <U+221A> tidyr 0.7.2 <U+221A> stringr 1.2.0
## <U+221A> readr 1.1.1 <U+221A> forcats 0.2.0
## -- Conflicts ------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
data <- read.csv("/Users/zhengdongnanzi/Desktop/Employment_by_Congressional_District.csv")
data <- as_tibble(data)
data
## # A tibble: 8,722 x 3
## Congressional_District NAICS_Decription Employment
## <fct> <fct> <int>
## 1 AK 1 Agriculture, forestry, fishing and h~ 819
## 2 AK 1 Mining, quarrying, and oil and gas e~ 13566
## 3 AK 1 Utilities 2081
## 4 AK 1 Construction 18581
## 5 AK 1 Manufacturing 12294
## 6 AK 1 Wholesale trade 9125
## 7 AK 1 Retail trade 35221
## 8 AK 1 Transportation and warehousing 18736
## 9 AK 1 Information 6711
## 10 AK 1 Finance and insurance 7340
## # ... with 8,712 more rows
data1 <- filter(data, NAICS_Decription=="Construction")
arrange(data1,desc(Employment))
## # A tibble: 437 x 3
## Congressional_District NAICS_Decription Employment
## <fct> <fct> <int>
## 1 LA 6 Construction 58288
## 2 NY 12 Construction 38877
## 3 TX 24 Construction 34315
## 4 TX 2 Construction 33409
## 5 TX 36 Construction 29717
## 6 TX 7 Construction 28496
## 7 TX 29 Construction 28021
## 8 TX 21 Construction 27954
## 9 CO 7 Construction 27498
## 10 PA 18 Construction 26692
## # ... with 427 more rows
# I currently live in NY 10.
data2 <- filter(data, Congressional_District =="NY 10")
data2 <- arrange(data2,desc(Employment))
head(data2)
## # A tibble: 6 x 3
## Congressional_District NAICS_Decription Employment
## <fct> <fct> <int>
## 1 NY 10 Finance and insurance 109148
## 2 NY 10 Health care and social assistance 102966
## 3 NY 10 Professional, scientific, and technic~ 96054
## 4 NY 10 Accommodation and food services 81321
## 5 NY 10 Educational services 64752
## 6 NY 10 Retail trade 56372
tail(data2)
## # A tibble: 6 x 3
## Congressional_District NAICS_Decription Employment
## <fct> <fct> <int>
## 1 NY 10 Construction 15143
## 2 NY 10 Manufacturing 11789
## 3 NY 10 Transportation and warehousing 10830
## 4 NY 10 Utilities 916
## 5 NY 10 Industries not classified 179
## 6 NY 10 Mining, quarrying, and oil and gas ex~ 0
data3 <- rename(data, CD=Congressional_District)
data3
## # A tibble: 8,722 x 3
## CD NAICS_Decription Employment
## <fct> <fct> <int>
## 1 AK 1 Agriculture, forestry, fishing and hunting 819
## 2 AK 1 Mining, quarrying, and oil and gas extraction 13566
## 3 AK 1 Utilities 2081
## 4 AK 1 Construction 18581
## 5 AK 1 Manufacturing 12294
## 6 AK 1 Wholesale trade 9125
## 7 AK 1 Retail trade 35221
## 8 AK 1 Transportation and warehousing 18736
## 9 AK 1 Information 6711
## 10 AK 1 Finance and insurance 7340
## # ... with 8,712 more rows
data4 <- separate(data3,CD,
into = c("state","congressional_district"), sep=" ")
data4
## # A tibble: 8,722 x 4
## state congressional_district NAICS_Decription Employment
## * <chr> <chr> <fct> <int>
## 1 AK 1 Agriculture, forestry, fishing~ 819
## 2 AK 1 Mining, quarrying, and oil and~ 13566
## 3 AK 1 Utilities 2081
## 4 AK 1 Construction 18581
## 5 AK 1 Manufacturing 12294
## 6 AK 1 Wholesale trade 9125
## 7 AK 1 Retail trade 35221
## 8 AK 1 Transportation and warehousing 18736
## 9 AK 1 Information 6711
## 10 AK 1 Finance and insurance 7340
## # ... with 8,712 more rows
data5 <- unite(data4, CD_united, state, congressional_district, sep=" ")
data5
## # A tibble: 8,722 x 3
## CD_united NAICS_Decription Employment
## * <chr> <fct> <int>
## 1 AK 1 Agriculture, forestry, fishing and hunting 819
## 2 AK 1 Mining, quarrying, and oil and gas extraction 13566
## 3 AK 1 Utilities 2081
## 4 AK 1 Construction 18581
## 5 AK 1 Manufacturing 12294
## 6 AK 1 Wholesale trade 9125
## 7 AK 1 Retail trade 35221
## 8 AK 1 Transportation and warehousing 18736
## 9 AK 1 Information 6711
## 10 AK 1 Finance and insurance 7340
## # ... with 8,712 more rows
# dataset 1)
data6_1 <- select(data,Congressional_District,NAICS_Decription)
data6_1
## # A tibble: 8,722 x 2
## Congressional_District NAICS_Decription
## <fct> <fct>
## 1 AK 1 Agriculture, forestry, fishing and hunting
## 2 AK 1 Mining, quarrying, and oil and gas extraction
## 3 AK 1 Utilities
## 4 AK 1 Construction
## 5 AK 1 Manufacturing
## 6 AK 1 Wholesale trade
## 7 AK 1 Retail trade
## 8 AK 1 Transportation and warehousing
## 9 AK 1 Information
## 10 AK 1 Finance and insurance
## # ... with 8,712 more rows
# dataset 2)
data6_2 <- select(data,Congressional_District,Employment)
data6_2
## # A tibble: 8,722 x 2
## Congressional_District Employment
## <fct> <int>
## 1 AK 1 819
## 2 AK 1 13566
## 3 AK 1 2081
## 4 AK 1 18581
## 5 AK 1 12294
## 6 AK 1 9125
## 7 AK 1 35221
## 8 AK 1 18736
## 9 AK 1 6711
## 10 AK 1 7340
## # ... with 8,712 more rows
left_join(data6_1,data6_2, by="Congressional_District).# Assign unique ID to each column in dataset6_1.
data6_1["ID"] <- 1:8722
data6_1
## # A tibble: 8,722 x 3
## Congressional_District NAICS_Decription ID
## <fct> <fct> <int>
## 1 AK 1 Agriculture, forestry, fishing and hunting 1
## 2 AK 1 Mining, quarrying, and oil and gas extrac~ 2
## 3 AK 1 Utilities 3
## 4 AK 1 Construction 4
## 5 AK 1 Manufacturing 5
## 6 AK 1 Wholesale trade 6
## 7 AK 1 Retail trade 7
## 8 AK 1 Transportation and warehousing 8
## 9 AK 1 Information 9
## 10 AK 1 Finance and insurance 10
## # ... with 8,712 more rows
# Assign unique ID to each column in dataset6_2.
data6_2["ID"] <- 1:8722
data6_2
## # A tibble: 8,722 x 3
## Congressional_District Employment ID
## <fct> <int> <int>
## 1 AK 1 819 1
## 2 AK 1 13566 2
## 3 AK 1 2081 3
## 4 AK 1 18581 4
## 5 AK 1 12294 5
## 6 AK 1 9125 6
## 7 AK 1 35221 7
## 8 AK 1 18736 8
## 9 AK 1 6711 9
## 10 AK 1 7340 10
## # ... with 8,712 more rows
# Join the two datasets with Congressional_District and Unique ID.
data7 <- left_join(data6_1,data6_2, by=c("Congressional_District","ID"))
data7
## # A tibble: 8,722 x 4
## Congressional_District NAICS_Decription ID Employment
## <fct> <fct> <int> <int>
## 1 AK 1 Agriculture, forestry, fishing~ 1 819
## 2 AK 1 Mining, quarrying, and oil and~ 2 13566
## 3 AK 1 Utilities 3 2081
## 4 AK 1 Construction 4 18581
## 5 AK 1 Manufacturing 5 12294
## 6 AK 1 Wholesale trade 6 9125
## 7 AK 1 Retail trade 7 35221
## 8 AK 1 Transportation and warehousing 8 18736
## 9 AK 1 Information 9 6711
## 10 AK 1 Finance and insurance 10 7340
## # ... with 8,712 more rows
# Remove the unique ID we created to revert back to our orginal dataset
data7 <- data7[,-3]
# Answer:
data7
## # A tibble: 8,722 x 3
## Congressional_District NAICS_Decription Employment
## <fct> <fct> <int>
## 1 AK 1 Agriculture, forestry, fishing and h~ 819
## 2 AK 1 Mining, quarrying, and oil and gas e~ 13566
## 3 AK 1 Utilities 2081
## 4 AK 1 Construction 18581
## 5 AK 1 Manufacturing 12294
## 6 AK 1 Wholesale trade 9125
## 7 AK 1 Retail trade 35221
## 8 AK 1 Transportation and warehousing 18736
## 9 AK 1 Information 6711
## 10 AK 1 Finance and insurance 7340
## # ... with 8,712 more rows
data8 <- group_by(data,NAICS_Decription)
data8 <- arrange(summarise (data8, averageemployment = mean(Employment,na.rm=TRUE)),
desc(averageemployment))
data8 %>% print(n = 10, width = Inf)
## # A tibble: 20 x 2
## NAICS_Decription averageemployment
## <fct> <dbl>
## 1 Health care and social assistance 44059
## 2 Retail trade 36221
## 3 Accommodation and food services 30366
## 4 Manufacturing 26725
## 5 Professional, scientific, and technical 19264
## 6 Administrative and support and waste 18205
## 7 Finance and insurance 13783
## 8 Construction 13691
## 9 Wholesale trade 13617
## 10 Other services (except public administration) 12395
## # ... with 10 more rows
data9 <- spread(data,Congressional_District,Employment)
data9 %>% print(n = 10)
## # A tibble: 20 x 438
## NAICS_Decription `AK 1` `AL 1` `AL 2` `AL 3` `AL 4` `AL 5` `AL 6`
## * <fct> <int> <int> <int> <int> <int> <int> <int>
## 1 Accommodation and foo~ 28112 29067 24892 21075 17891 27023 26588
## 2 Administrative and su~ 17628 15559 13053 10887 9185 12414 19994
## 3 Agriculture, forestry~ 819 1196 1762 688 879 189 574
## 4 Arts, entertainment, ~ 4639 2996 3753 1117 966 2472 4495
## 5 Construction 18581 15516 8957 6979 6824 10506 15028
## 6 Educational services 3025 4882 3821 3288 701 3622 5399
## 7 Finance and insurance 7340 7760 6992 4581 5792 6630 21357
## 8 Health care and socia~ 49240 30929 36033 26115 29705 38179 29929
## 9 Industries not classi~ 76 28 89 26 60 30 33
## 10 Information 6711 3997 3416 1968 1960 5520 10993
## # ... with 10 more rows, and 430 more variables: `AL 7` <int>, `AR
## # 1` <int>, `AR 2` <int>, `AR 3` <int>, `AR 4` <int>, `AZ 1` <int>, `AZ
## # 2` <int>, `AZ 3` <int>, `AZ 4` <int>, `AZ 5` <int>, `AZ 6` <int>, `AZ
## # 7` <int>, `AZ 8` <int>, `AZ 9` <int>, `CA 1` <int>, `CA 10` <int>, `CA
## # 11` <int>, `CA 12` <int>, `CA 13` <int>, `CA 14` <int>, `CA 15` <int>,
## # `CA 16` <int>, `CA 17` <int>, `CA 18` <int>, `CA 19` <int>, `CA
## # 2` <int>, `CA 20` <int>, `CA 21` <int>, `CA 22` <int>, `CA 23` <int>,
## # `CA 24` <int>, `CA 25` <int>, `CA 26` <int>, `CA 27` <int>, `CA
## # 28` <int>, `CA 29` <int>, `CA 3` <int>, `CA 30` <int>, `CA 31` <int>,
## # `CA 32` <int>, `CA 33` <int>, `CA 34` <int>, `CA 35` <int>, `CA
## # 36` <int>, `CA 37` <int>, `CA 38` <int>, `CA 39` <int>, `CA 4` <int>,
## # `CA 40` <int>, `CA 41` <int>, `CA 42` <int>, `CA 43` <int>, `CA
## # 44` <int>, `CA 45` <int>, `CA 46` <int>, `CA 47` <int>, `CA 48` <int>,
## # `CA 49` <int>, `CA 5` <int>, `CA 50` <int>, `CA 51` <int>, `CA
## # 52` <int>, `CA 53` <int>, `CA 6` <int>, `CA 7` <int>, `CA 8` <int>,
## # `CA 9` <int>, `CO 1` <int>, `CO 2` <int>, `CO 3` <int>, `CO 4` <int>,
## # `CO 5` <int>, `CO 6` <int>, `CO 7` <int>, `CT 1` <int>, `CT 2` <int>,
## # `CT 3` <int>, `CT 4` <int>, `CT 5` <int>, `DC 98` <int>, `DE 1` <int>,
## # `FL 1` <int>, `FL 10` <int>, `FL 11` <int>, `FL 12` <int>, `FL
## # 13` <int>, `FL 14` <int>, `FL 15` <int>, `FL 16` <int>, `FL 17` <int>,
## # `FL 18` <int>, `FL 19` <int>, `FL 2` <int>, `FL 20` <int>, `FL
## # 21` <int>, `FL 22` <int>, `FL 23` <int>, `FL 24` <int>, `FL 25` <int>,
## # `FL 26` <int>, ...
data10 <- gather(data9,Congressional_District,Employment,-NAICS_Decription)
data10
## # A tibble: 8,740 x 3
## NAICS_Decription Congressional_Di~ Employment
## <fct> <chr> <int>
## 1 Accommodation and food services AK 1 28112
## 2 Administrative and support and waste AK 1 17628
## 3 Agriculture, forestry, fishing and hunting AK 1 819
## 4 Arts, entertainment, and recreation AK 1 4639
## 5 Construction AK 1 18581
## 6 Educational services AK 1 3025
## 7 Finance and insurance AK 1 7340
## 8 Health care and social assistance AK 1 49240
## 9 Industries not classified AK 1 76
## 10 Information AK 1 6711
## # ... with 8,730 more rows