library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(zoo)
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(knitr)
# Load the raw spreadsheet export. Blank cells and the literal string "NA" are
# both read in as missing values.
H1B3 <- read.csv("/Users/christinakasman/Desktop/H1B3.csv", na.strings = c("", "NA"))
library(zoo)
# Fill the gaps in column X by carrying the last non-missing value forward.
# `na.strings` above has already turned empty strings into NA, so no separate
# blank-to-NA step is needed here.
# na.rm = FALSE keeps any leading NA in place. The default (TRUE) drops leading
# NAs and returns a shorter vector, which cannot be assigned back to the column.
H1B3$X <- na.locf(H1B3$X, na.rm = FALSE)
head(as.data.frame(H1B3))
## X X.1 X.2
## 1 USCIS Field Office or Service Center Location <NA> <NA>
## 2 USCIS Field Office or Service Center Location <NA> <NA>
## 3 USCIS Field Office or Service Center Location <NA> <NA>
## 4 USCIS Field Office or Service Center Location <NA> <NA>
## 5 Total <NA> <NA>
## 6 Field Office by State6 <NA> <NA>
## X.3 X.4
## 1 Applications by Category of Admission and Case Status <NA>
## 2 Family-based <NA>
## 3 Applications Received2 Approved3
## 4 <NA> <NA>
## 5 93914 71393
## 6 <NA> <NA>
## X.5 X.6 X.7 X.8
## 1 <NA> <NA> <NA> <NA>
## 2 <NA> <NA> Employment-based received at service center1 <NA>
## 3 Denied4 Pending5 Applications Received2 Approved3
## 4 <NA> <NA> <NA>
## 5 9854 305370 41920 29689
## 6 <NA> <NA> <NA> <NA>
## Number.of.I.485.Applications.to.Register.Permanent.Residence.or.Adjust.Status.by.Category.of...............Admission..Case.Status..and.USCIS.Field.Office.or.Service.Center.Location...............April.1...June.30..2017
## 1 <NA>
## 2 <NA>
## 3 Denied4
## 4 <NA>
## 5 1705
## 6 <NA>
## X.9 X.10 X.11 X.12 X.13
## 1 <NA> <NA> <NA> <NA> <NA>
## 2 <NA> Humanitarian-based1 <NA> <NA> <NA>
## 3 Pending5 Applications Received2 Approved3 Denied4 Pending5
## 4 <NA> <NA> <NA> <NA> <NA>
## 5 148547 47773 36736 1262 122768
## 6 <NA> <NA> <NA> <NA> <NA>
## X.14 X.15 X.16 X.17 X.18
## 1 <NA> <NA> <NA> <NA> <NA>
## 2 Others1 <NA> <NA> <NA> Total
## 3 Applications Received2 Approved3 Denied4 Pending5 Applications Received2
## 4 <NA> <NA> <NA> <NA> <NA>
## 5 11367 7986 931 32356 194974
## 6 <NA> <NA> <NA> <NA> <NA>
## X.19 X.20 X.21 X.22 X.23 X.24 X.25 X.26 X.27 X.28 X.29 X.30
## 1 <NA> <NA> <NA> NA NA NA NA NA NA NA NA NA
## 2 <NA> <NA> <NA> NA NA NA NA NA NA NA NA NA
## 3 Approved3 Denied4 Pending5 NA NA NA NA NA NA NA NA NA
## 4 <NA> <NA> <NA> NA NA NA NA NA NA NA NA NA
## 5 145804 13752 609041 NA NA NA NA NA NA NA NA NA
## 6 <NA> <NA> <NA> NA NA NA NA NA NA NA NA NA
## X.31 X.32 X.33 X.34 X.35 X.36 X.37 X.38 X.39 X.40 X.41 X.42
## 1 NA NA NA NA NA NA NA NA NA NA NA NA
## 2 NA NA NA NA NA NA NA NA NA NA NA NA
## 3 NA NA NA NA NA NA NA NA NA NA NA NA
## 4 NA NA NA NA NA NA NA NA NA NA NA NA
## 5 NA NA NA NA NA NA NA NA NA NA NA NA
## 6 NA NA NA NA NA NA NA NA NA NA NA NA
**I was interested in looking at family-based applications received per state.**
# Drop the first seven rows (titles, column headings and the grand total in the
# printed preview) and keep only the first seven columns: the three location
# fields followed by the four family-based counts.
my.new.data <- H1B3[-(1:7), -(8:44)]
colnames(my.new.data)[1:7] <- c(
  "State", "City", "Code",
  "Applications_Received", "Approved", "Denied", "Pending"
)
# Discard every row that still has a missing value in any of the kept columns.
my.new.data2 <- na.omit(my.new.data)
# Total family-based denials per state, largest totals first.
# The count columns share their CSV column with header text, so they are read
# as factors (or character), not numbers. They must pass through as.character()
# before as.numeric(): calling as.numeric() directly on a factor returns the
# internal level codes, not the printed counts.
# na.rm = TRUE: a cell that is not a plain number coerces to NA (with a
# warning) and is left out of the total instead of turning the whole state's
# sum into NA.
data3 <- my.new.data2 %>%
  group_by(State) %>%
  summarise(Denied = sum(as.numeric(as.character(Denied)), na.rm = TRUE))
head(data3[order(data3$Denied, decreasing = TRUE), ])
## # A tibble: 6 x 2
## State Denied
## <fctr> <dbl>
## 1 California 394
## 2 Florida 344
## 3 New York 170
## 4 Texas 170
## 5 North Carolina 129
## 6 Washington 127
California had the highest number of denied family-based I-485 applications.
# Total family-based approvals per state, largest totals first.
# The count columns share their CSV column with header text, so they are read
# as factors (or character), not numbers. They must pass through as.character()
# before as.numeric(): calling as.numeric() directly on a factor returns the
# internal level codes, not the printed counts.
# na.rm = TRUE: a cell that is not a plain number coerces to NA (with a
# warning) and is left out of the total instead of turning the whole state's
# sum into NA.
data3 <- my.new.data2 %>%
  group_by(State) %>%
  summarise(Approved = sum(as.numeric(as.character(Approved)), na.rm = TRUE))
head(data3[order(data3$Approved, decreasing = TRUE), ])
## # A tibble: 6 x 2
## State Approved
## <fctr> <dbl>
## 1 California 509
## 2 Florida 364
## 3 Texas 203
## 4 U.S. Virgin Islands 167
## 5 New York 155
## 6 North Carolina 143
California had the highest number of approved family-based I-485 applications.
Data Set 3: UNICEF Child Mortality Data
unicef <- read.csv("/Users/christinakasman/Desktop/unicef-u5mr.csv")
# Reshape from one column per year (U5MR.1950 ... U5MR.2015) to one row per
# country/year pair, then drop the pairs that have no recorded value.
unicef2 <- gather(unicef, year, "mortality", U5MR.1950:U5MR.2015)
unicef2 <- unicef2 %>% na.omit()
library(stringr)
# Pull the four-digit year out of each "U5MR.YYYY" label. Extracting from the
# `year` column alone returns exactly one value per row. Running the pattern
# over the whole data frame only works through deparse coercion, and any other
# four-digit run in the data would throw the result out of step with the rows.
yr <- str_extract(unicef2$year, "[[:digit:]]{4}")
unicef3 <- cbind(unicef2, yr)
# Column 2 is the original "U5MR.YYYY" label, now redundant with `yr`.
unicef4 <- unicef3[, -c(2)]
head(unicef4[order(unicef4$mortality, decreasing = TRUE), ])
## CountryName mortality yr
## 2656 Mali 443.5 1963
## 2852 Mali 435.8 1964
## 3048 Mali 428.5 1965
## 3244 Mali 421.8 1966
## 3440 Mali 415.4 1967
## 3636 Mali 409.8 1968
Mali had the highest under-five mortality rate: 443.5 in 1963.
# Add up every recorded mortality value for each country, then show the six
# countries with the largest sums.
plot1 <- summarise(group_by(unicef3, CountryName), total = sum(mortality))
head(plot1[order(plot1$total, decreasing = TRUE), ], n = 6)
## # A tibble: 6 x 2
## CountryName total
## <fctr> <dbl>
## 1 Burkina Faso 16312.6
## 2 Sierra Leone 15005.2
## 3 Mali 14506.3
## 4 Benin 14238.7
## 5 Senegal 13285.3
## 6 Liberia 13215.8
Burkina Faso has the highest under-five mortality rate when summed over all recorded years.
# Add up the recorded mortality values of all countries for each year, then
# show the six years with the largest sums.
plot <- summarise(group_by(unicef3, yr), total = sum(mortality))
head(plot[order(plot$total, decreasing = TRUE), ], n = 6)
## # A tibble: 6 x 2
## yr total
## <fctr> <dbl>
## 1 1969 18279.6
## 2 1970 18244.9
## 3 1972 18239.8
## 4 1971 18150.9
## 5 1975 17814.9
## 6 1973 17750.8
1969 had the highest under-five mortality rate when summed across all countries.
library(ggplot2)
# Bar chart of the per-year totals. The x-axis tick marks and tick labels are
# hidden, so only the axis title "Year" is shown below the bars.
ggplot(data = plot, mapping = aes(x = yr, y = total)) +
  geom_bar(stat = "identity", fill = "#009E73") +
  xlab("Year") +
  ggtitle("Child Mortality over time") +
  theme(
    axis.ticks.x = element_blank(),
    axis.text.x = element_blank()
  )