Dataset 1 Location
The csv file is hosted in github for processing.
Dataset 1 CSV
library(RCurl)
url <- getURL("https://raw.githubusercontent.com/binishkurian/DATA-607/master/project-02/health_spending_per_person.csv")
df <- read.csv(text = url, header = TRUE, check.names = FALSE)
(health_spending <- as.tibble(df) %>%
gather(`1995`:`2010`, key = "year", value = "USD", na.rm = TRUE))
## # A tibble: 3,008 x 3
## country year USD
## * <fctr> <chr> <dbl>
## 1 Albania 1995 27.9
## 2 Algeria 1995 62.1
## 3 Andorra 1995 1392
## 4 Angola 1995 15.6
## 5 Antigua and Barbuda 1995 351
## 6 Argentina 1995 615
## 7 Armenia 1995 25.7
## 8 Australia 1995 1565
## 9 Austria 1995 2856
## 10 Azerbaijan 1995 18.2
## # ... with 2,998 more rows
(max_spending_for_each_country <-health_spending %>%
group_by(country) %>%
summarise(max_spending = max(USD, na.rm = TRUE)))
## # A tibble: 190 x 2
## country max_spending
## <fctr> <dbl>
## 1 Afghanistan 37.7
## 2 Albania 275
## 3 Algeria 186
## 4 Andorra 3391
## 5 Angola 201
## 6 Antigua and Barbuda 690
## 7 Argentina 742
## 8 Armenia 143
## 9 Australia 4775
## 10 Austria 5153
## # ... with 180 more rows
max_spending_for_each_country %>%
mutate(r = min_rank(desc(max_spending))) %>%
arrange(r)
## # A tibble: 190 x 3
## country max_spending r
## <fctr> <dbl> <int>
## 1 United States 8362 1
## 2 Luxembourg 8262 2
## 3 Norway 8091 3
## 4 Switzerland 7812 4
## 5 Monaco 6692 5
## 6 Denmark 6452 6
## 7 Iceland 6175 7
## 8 Netherlands 5751 8
## 9 Ireland 5241 9
## 10 Canada 5222 10
## # ... with 180 more rows
United States has most spending followed by Luxembourg, Norway, Switzerland and Monaco
health_spending %>%
filter(country %in% c("United States", "Luxembourg", "Norway", "Switzerland", "Monaco")) %>%
ggplot(mapping = aes(x = country, y = USD)) +
geom_boxplot()
health_spending %>%
filter(country %in% c("United States", "Luxembourg", "Norway", "Switzerland", "Monaco")) %>%
ggplot(mapping = aes(x = year, y = USD)) +
geom_line(aes(group = country), colour = "grey50") +
geom_point(aes(colour = country))
The general trend for United States is to have increased spending year by year.
Dataset 2 Location
The csv file is hosted in github for processing.
Dataset 2 CSV
library(RCurl)
url <- getURL("https://raw.githubusercontent.com/binishkurian/DATA-607/master/project-02/movie_rating.csv")
df <- read.csv(text = url, header = TRUE, check.names = FALSE)
(tidy_rating <- as.tibble(df) %>%
select(-Timestamp) %>%
gather(`Guardians of the Galaxy 2`:`Valerian and the City of a Thousand Planets`, key = movie, value = rating))
## # A tibble: 260 x 4
## Name `Number of Movies Seen` movie rati…
## <fctr> <int> <chr> <int>
## 1 Eva 2 Guardians of the Galax… 0
## 2 Mike Gilbert 5 Guardians of the Galax… 5
## 3 Gino 7 Guardians of the Galax… 5
## 4 Heather Ahram 0 Guardians of the Galax… 0
## 5 Diane 1 Guardians of the Galax… 0
## 6 Anon1 4 Guardians of the Galax… 3
## 7 "Ryan Graziano " 4 Guardians of the Galax… 3
## 8 Mercedes 7 Guardians of the Galax… 4
## 9 Kerry 5 Guardians of the Galax… 4
## 10 Samantha Esposito 4 Guardians of the Galax… 4
## # ... with 250 more rows
tidy_rating %>%
group_by(movie) %>%
summarise(av_rating = mean(rating, na.rm = TRUE)) %>%
mutate(r = min_rank(desc(av_rating))) %>%
arrange(r)
## # A tibble: 10 x 3
## movie av_rating r
## <chr> <dbl> <int>
## 1 Wonder Woman 3.12 1
## 2 Star Wars: The Last Jedi 2.92 2
## 3 Guardians of the Galaxy 2 2.73 3
## 4 Thor: Ragnarok 2.08 4
## 5 Spider-Man: Homecoming 1.58 5
## 6 Blade Runner 2049 1.54 6
## 7 War for the Planet of the Apes 0.962 7
## 8 Ghost in the Shell 0.692 8
## 9 Valerian and the City of a Thousand Planets 0.538 9
## 10 Alien: Covenant 0.385 10
Wonder Woman has the max average rating of 3.12 followed by Star Wars:
The Last Jedi with average rating of 2.92
tidy_rating %>%
ggplot() +
stat_summary(mapping = aes(x = movie, y = rating),
fun.ymin = min,
fun.ymax = max,
fun.y = mean) +
coord_flip()
tidy_rating %>%
group_by(Name) %>%
summarize(max_rating = max(rating, na.rm = TRUE)) %>%
arrange(desc(max_rating))
## # A tibble: 26 x 2
## Name max_rating
## <fctr> <dbl>
## 1 Adam 5.00
## 2 Anon2 5.00
## 3 Anon4 5.00
## 4 Betsy 5.00
## 5 Brian 5.00
## 6 Cheri 5.00
## 7 Chris VA 5.00
## 8 Diane 5.00
## 9 Gino 5.00
## 10 Hugo Walker 5.00
## # ... with 16 more rows
tidy_rating %>%
ggplot() +
stat_summary(mapping = aes(x = Name, y = rating),
fun.ymin = min,
fun.ymax = max,
fun.y = mean) +
coord_flip()
Dataset 3 Location
The csv file is hosted in github for processing.
Dataset 3 CSV
library(RCurl)
url <- getURL("https://raw.githubusercontent.com/binishkurian/DATA-607/master/project-02/unemployment_above_55.csv")
df <- read.csv(text = url, header = TRUE, check.names = FALSE)
tidy_unemploy <- as.tibble(df) %>%
gather(`1981`:`2005`, key = year, value = prec, na.rm = TRUE)
tidy_unemploy %>%
group_by(country) %>%
summarise(max_perc = max(prec, na.rm = TRUE))
## # A tibble: 29 x 2
## country max_perc
## <fctr> <dbl>
## 1 Australia 9.20
## 2 Canada 9.30
## 3 Czech Rep. 5.40
## 4 Estonia 8.50
## 5 Finland 19.8
## 6 France 9.00
## 7 Germany 14.4
## 8 Hong Kong, China 8.20
## 9 Ireland 8.90
## 10 Japan 4.80
## # ... with 19 more rows
tidy_unemploy %>%
group_by(country) %>%
summarise(max_perc = max(prec, na.rm = TRUE)) %>%
arrange(desc(max_perc))
## # A tibble: 29 x 2
## country max_perc
## <fctr> <dbl>
## 1 Latvia 22.8
## 2 Finland 19.8
## 3 Slovak Republic 15.1
## 4 Germany 14.4
## 5 Spain 11.7
## 6 Lithuania 11.5
## 7 Poland 9.70
## 8 Canada 9.30
## 9 Australia 9.20
## 10 France 9.00
## # ... with 19 more rows
Latvia has the maximum unemployment percentage followed by Finland.
tidy_unemploy %>%
ggplot(aes(x = year, y = prec, group = country)) +
geom_jitter() +
coord_flip()