Choose any three of the “wide” datasets identified in the Week 6 Discussion items. (You may use your own dataset; please don’t use my Sample Post dataset, since that was used in your Week 6 assignment!) - load library
library(dplyr,warn.conflicts = FALSE)
options(dplyr.summarise.inform = FALSE)
library(tidyr)
library(knitr)
library(ggplot2)
library(stringr)
library(DT)
School diversity
The data is collected so that all the different races are in the same column and their proportions are in another column
Question: what is most popular race in school diversity
fileSource <- read.table("https://raw.githubusercontent.com/szx868/data607/master/school_diversity.csv",sep=',',header=TRUE)
head(fileSource)
## X LEAID LEA_NAME ST d_Locale_Txt SCHOOL_YEAR AIAN
## 1 1 100002 alabama youth services AL <NA> 1994-1995 0.00000000
## 2 2 100005 albertville city AL town-distant 1994-1995 0.00000000
## 3 3 100005 albertville city AL town-distant 2016-2017 0.29373967
## 4 4 100006 marshall county AL rural-distant 1994-1995 0.10436857
## 5 5 100006 marshall county AL rural-distant 2016-2017 0.49235098
## 6 6 100007 hoover city AL city-small 1994-1995 0.06518055
## Asian Black Hispanic White Multi Total diverse
## 1 0.5893910 71.7092338 0.1964637 27.50491 NA 509 Diverse
## 2 0.3207184 1.2828736 4.5221296 93.87428 NA 3118 Extremely undiverse
## 3 0.5507619 3.1944189 46.7413255 46.77804 2.441711 5447 Diverse
## 4 0.1341882 0.3727449 0.9094975 98.47920 NA 6707 Extremely undiverse
## 5 0.2989274 1.0726218 21.2941797 75.80447 1.037454 5687 Undiverse
## 6 1.6034415 6.0357189 0.5475166 91.74814 NA 7671 Extremely undiverse
## variance int_group
## 1 NA <NA>
## 2 NA <NA>
## 3 0.01155608 Highly integrated
## 4 NA <NA>
## 5 NA <NA>
## 6 NA <NA>
names(fileSource) <- c("ID","School.ID", "School.name","State", "Locale","YearsOfSchool","AIAN","Asian","Black","Hispanic","White","Other","Total.pop","Diverse","variance","Int.group")
head(fileSource)
## ID School.ID School.name State Locale YearsOfSchool
## 1 1 100002 alabama youth services AL <NA> 1994-1995
## 2 2 100005 albertville city AL town-distant 1994-1995
## 3 3 100005 albertville city AL town-distant 2016-2017
## 4 4 100006 marshall county AL rural-distant 1994-1995
## 5 5 100006 marshall county AL rural-distant 2016-2017
## 6 6 100007 hoover city AL city-small 1994-1995
## AIAN Asian Black Hispanic White Other Total.pop
## 1 0.00000000 0.5893910 71.7092338 0.1964637 27.50491 NA 509
## 2 0.00000000 0.3207184 1.2828736 4.5221296 93.87428 NA 3118
## 3 0.29373967 0.5507619 3.1944189 46.7413255 46.77804 2.441711 5447
## 4 0.10436857 0.1341882 0.3727449 0.9094975 98.47920 NA 6707
## 5 0.49235098 0.2989274 1.0726218 21.2941797 75.80447 1.037454 5687
## 6 0.06518055 1.6034415 6.0357189 0.5475166 91.74814 NA 7671
## Diverse variance Int.group
## 1 Diverse NA <NA>
## 2 Extremely undiverse NA <NA>
## 3 Diverse 0.01155608 Highly integrated
## 4 Extremely undiverse NA <NA>
## 5 Undiverse NA <NA>
## 6 Extremely undiverse NA <NA>
data <- filter(fileSource, Total.pop >=100)
data <- tidyr::gather(fileSource, "race", "percent.rate", 8:12)
head(data)
## ID School.ID School.name State Locale YearsOfSchool
## 1 1 100002 alabama youth services AL <NA> 1994-1995
## 2 2 100005 albertville city AL town-distant 1994-1995
## 3 3 100005 albertville city AL town-distant 2016-2017
## 4 4 100006 marshall county AL rural-distant 1994-1995
## 5 5 100006 marshall county AL rural-distant 2016-2017
## 6 6 100007 hoover city AL city-small 1994-1995
## AIAN Total.pop Diverse variance Int.group race
## 1 0.00000000 509 Diverse NA <NA> Asian
## 2 0.00000000 3118 Extremely undiverse NA <NA> Asian
## 3 0.29373967 5447 Diverse 0.01155608 Highly integrated Asian
## 4 0.10436857 6707 Extremely undiverse NA <NA> Asian
## 5 0.49235098 5687 Undiverse NA <NA> Asian
## 6 0.06518055 7671 Extremely undiverse NA <NA> Asian
## percent.rate
## 1 0.5893910
## 2 0.3207184
## 3 0.5507619
## 4 0.1341882
## 5 0.2989274
## 6 1.6034415
part1 <- filter(data,percent.rate !="NA" )
head(part1)
## ID School.ID School.name State Locale YearsOfSchool
## 1 1 100002 alabama youth services AL <NA> 1994-1995
## 2 2 100005 albertville city AL town-distant 1994-1995
## 3 3 100005 albertville city AL town-distant 2016-2017
## 4 4 100006 marshall county AL rural-distant 1994-1995
## 5 5 100006 marshall county AL rural-distant 2016-2017
## 6 6 100007 hoover city AL city-small 1994-1995
## AIAN Total.pop Diverse variance Int.group race
## 1 0.00000000 509 Diverse NA <NA> Asian
## 2 0.00000000 3118 Extremely undiverse NA <NA> Asian
## 3 0.29373967 5447 Diverse 0.01155608 Highly integrated Asian
## 4 0.10436857 6707 Extremely undiverse NA <NA> Asian
## 5 0.49235098 5687 Undiverse NA <NA> Asian
## 6 0.06518055 7671 Extremely undiverse NA <NA> Asian
## percent.rate
## 1 0.5893910
## 2 0.3207184
## 3 0.5507619
## 4 0.1341882
## 5 0.2989274
## 6 1.6034415
part2 <-
data.frame(part1 %>% dplyr::filter(Total.pop >= 100 ) %>%
dplyr::group_by(race) %>% dplyr::summarise(percent.rate))
head(part2)
## race percent.rate
## 1 Asian 0.5893910
## 2 Asian 0.3207184
## 3 Asian 0.5507619
## 4 Asian 0.1341882
## 5 Asian 0.2989274
## 6 Asian 1.6034415
target_mean <-
data.frame(part2 %>% dplyr::group_by(race) %>%
summarise(mean = mean(percent.rate)) %>%
arrange(desc(mean))
)
kable(head(target_mean, 6), format="markdown")
| race | mean |
|---|---|
| White | 76.206361 |
| Hispanic | 10.787878 |
| Black | 6.965477 |
| Other | 3.181411 |
| Asian | 1.866266 |
target_mean_ordered <- dplyr::arrange(target_mean, mean)
ggplot(data = target_mean_ordered[1:5,], aes(x=reorder(race,-mean), y=mean)) +
geom_bar(stat="identity", width=0.5, fill="lightblue",
position=position_dodge()) +
geom_text(aes(label=round(mean, digits=2)), hjust=1.3, size=3.0, color="white") +
coord_flip() +
ggtitle("School Diversity") +
xlab("") + ylab("Population percent Rate") +
theme_minimal()
White seems like have most population rate in school diversity
This dataset relating income to social factors such as Age, Education, race etc. The Us Adult income dataset was extracted by Barry Becker from the 1994 US Census Database. The data set consists of anonymous information such as occupation, age, native country, race, capital gain, capital loss, education, work class and more. Each row is labelled as either having a salary greater than “>50K” or “<=50K”. https://www.kaggle.com/johnolafenwa/us-census-data
Question: What age group have more chance to have salary greater than 50k?
fileSource <- read.table("https://raw.githubusercontent.com/szx868/data607/master/Project2/adult-training.csv",sep=',')
head(fileSource)
## V1 V2 V3 V4 V5 V6
## 1 39 State-gov 77516 Bachelors 13 Never-married
## 2 50 Self-emp-not-inc 83311 Bachelors 13 Married-civ-spouse
## 3 38 Private 215646 HS-grad 9 Divorced
## 4 53 Private 234721 11th 7 Married-civ-spouse
## 5 28 Private 338409 Bachelors 13 Married-civ-spouse
## 6 37 Private 284582 Masters 14 Married-civ-spouse
## V7 V8 V9 V10 V11 V12 V13 V14
## 1 Adm-clerical Not-in-family White Male 2174 0 40 United-States
## 2 Exec-managerial Husband White Male 0 0 13 United-States
## 3 Handlers-cleaners Not-in-family White Male 0 0 40 United-States
## 4 Handlers-cleaners Husband Black Male 0 0 40 United-States
## 5 Prof-specialty Wife Black Female 0 0 40 Cuba
## 6 Exec-managerial Wife White Female 0 0 40 United-States
## V15
## 1 <=50K
## 2 <=50K
## 3 <=50K
## 4 <=50K
## 5 <=50K
## 6 <=50K
names(fileSource) <- c("age", "Employed","Number", "Education","YearsOfSchool","MaritalStatus","Profession","Relation","Race","Gender","CapitalGain","CapitalLoss","HoursPerWeek","BirthPlace","Salary")
head(fileSource)
## age Employed Number Education YearsOfSchool MaritalStatus
## 1 39 State-gov 77516 Bachelors 13 Never-married
## 2 50 Self-emp-not-inc 83311 Bachelors 13 Married-civ-spouse
## 3 38 Private 215646 HS-grad 9 Divorced
## 4 53 Private 234721 11th 7 Married-civ-spouse
## 5 28 Private 338409 Bachelors 13 Married-civ-spouse
## 6 37 Private 284582 Masters 14 Married-civ-spouse
## Profession Relation Race Gender CapitalGain CapitalLoss
## 1 Adm-clerical Not-in-family White Male 2174 0
## 2 Exec-managerial Husband White Male 0 0
## 3 Handlers-cleaners Not-in-family White Male 0 0
## 4 Handlers-cleaners Husband Black Male 0 0
## 5 Prof-specialty Wife Black Female 0 0
## 6 Exec-managerial Wife White Female 0 0
## HoursPerWeek BirthPlace Salary
## 1 40 United-States <=50K
## 2 13 United-States <=50K
## 3 40 United-States <=50K
## 4 40 United-States <=50K
## 5 40 Cuba <=50K
## 6 40 United-States <=50K
part1 <-
data.frame(fileSource %>% dplyr::filter(Salary == " >50K" ) %>%
dplyr::group_by(age) %>% dplyr::summarise(length(Salary)))
head(part1)
## age length.Salary.
## 1 19 2
## 2 21 3
## 3 22 13
## 4 23 12
## 5 24 31
## 6 25 53
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
agebreaks <- c(0,1,5,10,15,20,25,30,35,40,45,50,55,60,65,70,75,80,85,500)
agelabels <- c("0-1","1-4","5-9","10-14","15-19","20-24","25-29","30-34",
"35-39","40-44","45-49","50-54","55-59","60-64","65-69",
"70-74","75-79","80-84","85+")
setDT(part1)[ , agegroups := cut(part1$age,
breaks = agebreaks,
right = FALSE,
labels = agelabels)]
part2 <-
data.frame(part1 %>%
dplyr::group_by(agegroups) %>% dplyr::summarise(sum(length.Salary.)))
part2
## agegroups sum.length.Salary..
## 1 15-19 2
## 2 20-24 59
## 3 25-29 450
## 4 30-34 977
## 5 35-39 1332
## 6 40-44 1371
## 7 45-49 1291
## 8 50-54 1057
## 9 55-59 650
## 10 60-64 376
## 11 65-69 163
## 12 70-74 66
## 13 75-79 31
## 14 80-84 8
## 15 85+ 8
target <-
data.frame(part2 %>%arrange(desc(sum.length.Salary..))
)
target
## agegroups sum.length.Salary..
## 1 40-44 1371
## 2 35-39 1332
## 3 45-49 1291
## 4 50-54 1057
## 5 30-34 977
## 6 55-59 650
## 7 25-29 450
## 8 60-64 376
## 9 65-69 163
## 10 70-74 66
## 11 20-24 59
## 12 75-79 31
## 13 80-84 8
## 14 85+ 8
## 15 15-19 2
It appears that person who are 40 to 44 years old have most population to earn more than 50K
This dataset providing recent updates of global economic developments, with coverage of unemployment rate of countries.
fileSource <- "https://raw.githubusercontent.com/szx868/data607/master/Project2/Unemployment_Rate.csv"
data = read.table(fileSource, header=TRUE, sep=",", na.strings = c("", "NA"))
head(data)
## ï.. Advanced.Economies Argentina Australia Austria Belgium Bulgaria
## 1 NA NA NA NA NA NA NA
## 2 1991 6.747347 NA 9.614137 5.823096 6.439812 NA
## 3 1992 7.531875 NA 10.750080 5.941711 7.088092 13.23500
## 4 1993 7.958122 NA 10.866170 6.811381 8.619130 15.85583
## 5 1994 7.732834 NA 9.705695 6.545480 9.753554 14.06583
## 6 1995 7.279099 NA 8.471058 6.589767 9.674164 11.38583
## Bahrain Belarus Brazil Canada Switzerland Chile China Colombia Cyprus
## 1 NA NA NA NA NA NA NA NA NA
## 2 NA NA NA 10.316670 1.090451 NA NA NA NA
## 3 NA NA NA 11.216670 2.563105 NA NA NA NA
## 4 NA NA NA 11.375000 4.516116 NA NA NA NA
## 5 NA NA NA 10.391670 4.718465 NA NA NA NA
## 6 NA NA NA 9.466667 4.232892 NA NA NA NA
## Czech.Republic Germany Denmark Dominican.Republic Algeria
## 1 NA NA NA NA NA
## 2 NA 4.864885 NA NA 25.0
## 3 NA 5.764563 NA NA 27.0
## 4 4.325000 6.931370 NA NA 23.2
## 5 4.300000 7.340639 NA NA 24.4
## 6 4.033333 7.091997 NA NA 28.1
## EMDE.East.Asia...Pacific EMDE.Europe...Central.Asia Ecuador Egypt..Arab.Rep.
## 1 NA NA NA NA
## 2 NA NA NA NA
## 3 NA NA NA NA
## 4 NA NA NA NA
## 5 NA NA NA NA
## 6 NA 9.906594 NA NA
## Emerging.Market.and.Developing.Economies..EMDEs. Spain Estonia Finland
## 1 NA NA NA NA
## 2 NA 15.51667 1.475 6.666424
## 3 NA 17.06667 3.725 11.796830
## 4 NA 20.83333 6.550 16.384210
## 5 NA 22.05000 7.550 16.534420
## 6 NA 20.79167 9.750 15.426480
## France United.Kingdom Greece High.Income.Countries Hong.Kong.SAR..China
## 1 NA NA NA NA NA
## 2 7.800 8.825000 NA 6.733582 1.749540
## 3 8.650 9.966667 NA 7.649890 1.947085
## 4 9.650 10.400000 NA 8.225122 1.980375
## 5 10.250 9.500000 NA 8.124612 1.910873
## 6 9.675 8.658333 NA 7.625508 3.039959
## Croatia Hungary India Ireland Iceland Israel Italy Jordan Japan
## 1 NA NA NA NA NA NA NA NA NA
## 2 NA NA NA 14.73333 NA NA NA NA 2.099018
## 3 NA NA NA 15.40000 NA NA NA NA 2.151389
## 4 NA NA NA 15.63333 NA NA NA 19.7 2.503291
## 5 NA NA NA 14.35000 NA NA NA 15.8 2.890953
## 6 NA NA NA 12.28333 NA NA NA 15.3 3.153574
## Kazakhstan Korea..Rep. EMDE.Latin.America...Caribbean
## 1 NA NA NA
## 2 NA NA NA
## 3 NA NA NA
## 4 NA NA NA
## 5 NA NA NA
## 6 NA NA NA
## Low.Income.Countries..LIC. Sri.Lanka Lithuania Luxembourg Latvia Morocco
## 1 NA NA NA NA NA NA
## 2 NA 14.7 NA NA NA NA
## 3 NA 14.6 NA NA NA NA
## 4 NA 13.8 4.191667 NA NA NA
## 5 NA 13.1 3.625000 NA NA NA
## 6 NA 12.3 6.116667 2.600765 NA NA
## Mexico Middle.Income.Countries..MIC. North.Macedonia Malta
## 1 NA NA NA NA
## 2 NA NA NA NA
## 3 NA NA NA NA
## 4 NA NA NA NA
## 5 NA NA NA NA
## 6 NA NA NA NA
## EMDE.Middle.East...N..Africa Netherlands Norway New.Zealand Pakistan Peru
## 1 NA NA NA NA NA NA
## 2 NA NA 6.041667 10.611440 6.28 NA
## 3 NA NA 6.550000 10.644730 5.85 NA
## 4 NA NA 6.608333 9.800159 4.73 NA
## 5 NA NA 6.000000 8.342465 4.84 NA
## 6 NA NA 5.441667 6.451948 5.37 NA
## Philippines Poland Portugal Romania Russian.Federation EMDE.South.Asia
## 1 NA NA NA NA NA NA
## 2 10.475 9.008333 NA NA NA NA
## 3 9.850 12.933330 NA 5.450000 NA NA
## 4 9.350 15.033330 NA 9.208333 NA NA
## 5 9.550 16.508330 NA 10.975000 7.006540 NA
## 6 9.500 15.225000 7.150996 9.975000 8.308333 NA
## Saudi.Arabia Singapore EMDE.Sub.Saharan.Africa Slovakia Slovenia Sweden
## 1 NA NA NA NA NA NA
## 2 NA 1.750 NA 7.05000 NA 4.005607
## 3 NA 1.800 NA 11.31833 11.56667 7.110956
## 4 NA 1.675 NA 12.85500 14.57500 11.146890
## 5 NA 1.725 NA 14.62917 14.55000 10.766190
## 6 NA 1.725 NA 13.68083 14.04167 10.421390
## Thailand Tunisia Turkey Taiwan..China Uruguay United.States Venezuela..RB
## 1 NA NA NA NA NA NA NA
## 2 NA NA NA 1.533333 NA 6.850000 NA
## 3 NA NA NA 1.500000 NA 7.491667 NA
## 4 NA NA NA 1.425000 NA 6.908333 NA
## 5 NA NA NA 1.566667 NA 6.100000 NA
## 6 NA NA NA 1.808333 NA 5.591667 NA
## Vietnam World..WBG.members. South.Africa
## 1 NA NA NA
## 2 NA NA NA
## 3 NA NA NA
## 4 NA NA NA
## 5 NA NA NA
## 6 NA NA NA
dim(data)
## [1] 31 84
data <- data[-1,]
# rename rownames
row.names(data) <- c(1:30)
# rename first column to "year"
colnames(data)[1] <- "year"
# get column names for countries
column_names <- colnames(data)
# clean up column names for each country
column_names <-
#replace the ".." with underscore character
sapply(column_names, stringr::str_replace_all, pattern="[.]{2,2}",replacement= "_") %>%
#replace the period at the very end of the string with a blank character
sapply(stringr::str_replace_all, pattern="[.]{1,1}$",replacement= "") %>%
#replace "." with underscore character
sapply(stringr::str_replace_all, pattern="[.]{1,1}",replacement= "_") %>%
sapply(stringr::str_replace_all, pattern="[_]{2,2}",replacement= "_")
#update country column names with cleaner names
colnames(data) <- column_names
#preview data in wide format
kable(head(data[,1:10],10), format="markdown")
| year | Advanced_Economies | Argentina | Australia | Austria | Belgium | Bulgaria | Bahrain | Belarus | Brazil |
|---|---|---|---|---|---|---|---|---|---|
| 1991 | 6.747347 | NA | 9.614137 | 5.823096 | 6.439812 | NA | NA | NA | NA |
| 1992 | 7.531875 | NA | 10.750080 | 5.941711 | 7.088092 | 13.23500 | NA | NA | NA |
| 1993 | 7.958122 | NA | 10.866170 | 6.811381 | 8.619130 | 15.85583 | NA | NA | NA |
| 1994 | 7.732834 | NA | 9.705695 | 6.545480 | 9.753554 | 14.06583 | NA | NA | NA |
| 1995 | 7.279099 | NA | 8.471058 | 6.589767 | 9.674164 | 11.38583 | NA | NA | NA |
| 1996 | 7.246764 | NA | 8.516425 | 7.033851 | 9.544254 | 11.06167 | NA | NA | NA |
| 1997 | 6.952976 | NA | 8.363718 | 7.103283 | 9.213055 | 14.04583 | NA | NA | NA |
| 1998 | 6.584465 | NA | 7.651377 | 7.184796 | 9.338434 | 12.20333 | NA | NA | NA |
| 1999 | 6.263949 | NA | 6.869885 | 6.645249 | 8.409715 | 13.78250 | NA | NA | NA |
| 2000 | 5.770125 | NA | 6.273074 | 5.803798 | 6.874202 | 18.12917 | NA | NA | NA |
data <- tidyr::gather(data, "country_name", "annual_unemployment", 2:84)
kable(head(data, 10), format="markdown")
| year | country_name | annual_unemployment |
|---|---|---|
| 1991 | Advanced_Economies | 6.747347 |
| 1992 | Advanced_Economies | 7.531875 |
| 1993 | Advanced_Economies | 7.958122 |
| 1994 | Advanced_Economies | 7.732834 |
| 1995 | Advanced_Economies | 7.279099 |
| 1996 | Advanced_Economies | 7.246764 |
| 1997 | Advanced_Economies | 6.952976 |
| 1998 | Advanced_Economies | 6.584465 |
| 1999 | Advanced_Economies | 6.263949 |
| 2000 | Advanced_Economies | 5.770125 |
exclude_countries = c(
"Advanced_Economies",
"Egypt_Arab_Rep",
"High_Income_Countries",
"North_Macedonia",
"EMDE_Middle_East_N_Africa",
"EMDE_Sub_Saharan_Africa",
"World_WBG_members",
"South_Africa"
)
#only include years from 2011 to 2019 and exclude countries that are not actual countries
part1 <-
data %>% dplyr::filter(year >= 2011 & year <= 2019) %>%
dplyr::filter(!(country_name %in% exclude_countries))
dplyr::filter(data, country_name %in% exclude_countries) %>% dplyr::distinct(country_name)
## country_name
## 1 Advanced_Economies
## 2 Egypt_Arab_Rep
## 3 High_Income_Countries
## 4 North_Macedonia
## 5 EMDE_Middle_East_N_Africa
## 6 EMDE_Sub_Saharan_Africa
## 7 World_WBG_members
## 8 South_Africa
part2 <-
data.frame(part1 %>% dplyr::filter(year >= 2011 & year <= 2019) %>%
dplyr::group_by(country_name) %>% dplyr::summarise(length(which(is.na(annual_unemployment)))))
#rename column to "NA_count" for readability
colnames(part2) <- c("country_name", "NA_count")
part3 <- dplyr::filter(part2, NA_count == 0)
#only include country names with NA_count = 0.
#countries with NA_count of 0 are ones with complete data for years 2011 to 2019.
target <-
part1 %>% dplyr::filter(country_name %in% part3$country_name)
target_mean <-
data.frame(target %>% dplyr::group_by(country_name) %>%
summarise(mean = mean(annual_unemployment)) %>%
arrange(desc(mean))
)
kable(head(target_mean, 10), format="markdown")
| country_name | mean |
|---|---|
| Greece | 22.54346 |
| Spain | 20.56574 |
| Croatia | 15.22963 |
| Jordan | 14.88889 |
| Cyprus | 11.81481 |
| Portugal | 11.56960 |
| Italy | 11.01602 |
| Slovenia | 10.99259 |
| Latvia | 10.64777 |
| Turkey | 10.36204 |
target_mean_ordered <- dplyr::arrange(target_mean, mean)
ggplot(data = target_mean_ordered[1:10,], aes(x=reorder(country_name,-mean), y=mean)) +
geom_bar(stat="identity", width=0.5, fill="lightblue",
position=position_dodge()) +
geom_text(aes(label=round(mean, digits=2)), hjust=1.3, size=3.0, color="white") +
coord_flip() +
ggtitle("2011 - 2019: Top 10 Average Annual Unemployment Rates Countries") +
xlab("") + ylab("Unemployment Rate") +
theme_minimal()
Greece seems like have poor employment rate in recent year from 2011 to 2019