Dataset 1 - Total health spending per person (US$)

Dataset 1 Location
The csv file is hosted in github for processing.
Dataset 1 CSV

Create data frame from CSV

library(RCurl)
url <- getURL("https://raw.githubusercontent.com/binishkurian/DATA-607/master/project-02/health_spending_per_person.csv")
df <- read.csv(text = url, header = TRUE, check.names = FALSE)

Tidy data

(health_spending <- as.tibble(df) %>%
        gather(`1995`:`2010`, key = "year", value = "USD", na.rm = TRUE))
## # A tibble: 3,008 x 3
##    country             year     USD
##  * <fctr>              <chr>  <dbl>
##  1 Albania             1995    27.9
##  2 Algeria             1995    62.1
##  3 Andorra             1995  1392  
##  4 Angola              1995    15.6
##  5 Antigua and Barbuda 1995   351  
##  6 Argentina           1995   615  
##  7 Armenia             1995    25.7
##  8 Australia           1995  1565  
##  9 Austria             1995  2856  
## 10 Azerbaijan          1995    18.2
## # ... with 2,998 more rows

Find the maximum spending per person for each country

(max_spending_for_each_country <-health_spending %>%
        group_by(country) %>%
        summarise(max_spending = max(USD, na.rm = TRUE))) 
## # A tibble: 190 x 2
##    country             max_spending
##    <fctr>                     <dbl>
##  1 Afghanistan                 37.7
##  2 Albania                    275  
##  3 Algeria                    186  
##  4 Andorra                   3391  
##  5 Angola                     201  
##  6 Antigua and Barbuda        690  
##  7 Argentina                  742  
##  8 Armenia                    143  
##  9 Australia                 4775  
## 10 Austria                   5153  
## # ... with 180 more rows

Find the countries with maximum spending per person

max_spending_for_each_country %>%
        mutate(r = min_rank(desc(max_spending))) %>% 
        arrange(r)
## # A tibble: 190 x 3
##    country       max_spending     r
##    <fctr>               <dbl> <int>
##  1 United States         8362     1
##  2 Luxembourg            8262     2
##  3 Norway                8091     3
##  4 Switzerland           7812     4
##  5 Monaco                6692     5
##  6 Denmark               6452     6
##  7 Iceland               6175     7
##  8 Netherlands           5751     8
##  9 Ireland               5241     9
## 10 Canada                5222    10
## # ... with 180 more rows
United States has most spending followed by Luxembourg, Norway, Switzerland and Monaco

Let’s explore the spending per person for only above countries

health_spending %>%
        filter(country %in% c("United States", "Luxembourg", "Norway", "Switzerland", "Monaco")) %>%
        ggplot(mapping = aes(x = country, y = USD)) +
        geom_boxplot()

Let’s check the general trend in spending for above countries

health_spending %>%
        filter(country %in% c("United States", "Luxembourg", "Norway", "Switzerland", "Monaco")) %>%
        ggplot(mapping = aes(x = year, y = USD)) +
        geom_line(aes(group = country), colour = "grey50") + 
        geom_point(aes(colour = country))

The general trend for United States is to have increased spending year by year.

Dataset 2 - Betsy Rosalen’s Movie Ratings Dataset

Dataset 2 Location
The csv file is hosted in github for processing.
Dataset 2 CSV

Create data frame from CSV

library(RCurl)
url <- getURL("https://raw.githubusercontent.com/binishkurian/DATA-607/master/project-02/movie_rating.csv")
df <- read.csv(text = url, header = TRUE, check.names = FALSE)

Tidy data

(tidy_rating <- as.tibble(df) %>%
        select(-Timestamp) %>%
        gather(`Guardians of the Galaxy 2`:`Valerian and the City of a Thousand Planets`, key = movie, value = rating))
## # A tibble: 260 x 4
##    Name              `Number of Movies Seen` movie                   rati…
##    <fctr>                              <int> <chr>                   <int>
##  1 Eva                                     2 Guardians of the Galax…     0
##  2 Mike Gilbert                            5 Guardians of the Galax…     5
##  3 Gino                                    7 Guardians of the Galax…     5
##  4 Heather Ahram                           0 Guardians of the Galax…     0
##  5 Diane                                   1 Guardians of the Galax…     0
##  6 Anon1                                   4 Guardians of the Galax…     3
##  7 "Ryan Graziano "                        4 Guardians of the Galax…     3
##  8 Mercedes                                7 Guardians of the Galax…     4
##  9 Kerry                                   5 Guardians of the Galax…     4
## 10 Samantha Esposito                       4 Guardians of the Galax…     4
## # ... with 250 more rows

Find the movie with max average rating

tidy_rating %>%
        group_by(movie) %>%
        summarise(av_rating = mean(rating, na.rm = TRUE)) %>%
        mutate(r = min_rank(desc(av_rating))) %>%
        arrange(r)
## # A tibble: 10 x 3
##    movie                                       av_rating     r
##    <chr>                                           <dbl> <int>
##  1 Wonder Woman                                    3.12      1
##  2 Star Wars: The Last Jedi                        2.92      2
##  3 Guardians of the Galaxy 2                       2.73      3
##  4 Thor: Ragnarok                                  2.08      4
##  5 Spider-Man: Homecoming                          1.58      5
##  6 Blade Runner 2049                               1.54      6
##  7 War for the Planet of the Apes                  0.962     7
##  8 Ghost in the Shell                              0.692     8
##  9 Valerian and the City of a Thousand Planets     0.538     9
## 10 Alien: Covenant                                 0.385    10
Wonder Woman has the max average rating of 3.12 followed by Star Wars: 
The Last Jedi with average rating of 2.92

Let’s do summary computation for movies

tidy_rating %>%
        ggplot() +
        stat_summary(mapping = aes(x = movie, y = rating),
                     fun.ymin = min,
                     fun.ymax = max,
                     fun.y = mean) +
        coord_flip()

Let’s find the maximum rating provided by each evaluator

tidy_rating %>%
        group_by(Name) %>%
        summarize(max_rating = max(rating, na.rm = TRUE)) %>%
        arrange(desc(max_rating))
## # A tibble: 26 x 2
##    Name        max_rating
##    <fctr>           <dbl>
##  1 Adam              5.00
##  2 Anon2             5.00
##  3 Anon4             5.00
##  4 Betsy             5.00
##  5 Brian             5.00
##  6 Cheri             5.00
##  7 Chris VA          5.00
##  8 Diane             5.00
##  9 Gino              5.00
## 10 Hugo Walker       5.00
## # ... with 16 more rows

Do summary computation for movie evaluators

tidy_rating %>%
        ggplot() +
        stat_summary(mapping = aes(x = Name, y = rating),
                     fun.ymin = min,
                     fun.ymax = max,
                     fun.y = mean) +
        coord_flip()

Dataset 3 - Aged 55+ unemployment rate (%)

Dataset 3 Location
The csv file is hosted in github for processing.
Dataset 3 CSV

Create data frame from CSV

library(RCurl)
url <- getURL("https://raw.githubusercontent.com/binishkurian/DATA-607/master/project-02/unemployment_above_55.csv")
df <- read.csv(text = url, header = TRUE, check.names = FALSE)

Tidy data

tidy_unemploy <- as.tibble(df) %>%
        gather(`1981`:`2005`, key = year, value = prec, na.rm = TRUE)

Find maximum unemployment percentage for each country

tidy_unemploy %>%
        group_by(country) %>%
        summarise(max_perc = max(prec, na.rm = TRUE))
## # A tibble: 29 x 2
##    country          max_perc
##    <fctr>              <dbl>
##  1 Australia            9.20
##  2 Canada               9.30
##  3 Czech Rep.           5.40
##  4 Estonia              8.50
##  5 Finland             19.8 
##  6 France               9.00
##  7 Germany             14.4 
##  8 Hong Kong, China     8.20
##  9 Ireland              8.90
## 10 Japan                4.80
## # ... with 19 more rows

Find the country with max unemployment percentage

tidy_unemploy %>%
        group_by(country) %>%
        summarise(max_perc = max(prec, na.rm = TRUE)) %>%
        arrange(desc(max_perc))
## # A tibble: 29 x 2
##    country         max_perc
##    <fctr>             <dbl>
##  1 Latvia             22.8 
##  2 Finland            19.8 
##  3 Slovak Republic    15.1 
##  4 Germany            14.4 
##  5 Spain              11.7 
##  6 Lithuania          11.5 
##  7 Poland              9.70
##  8 Canada              9.30
##  9 Australia           9.20
## 10 France              9.00
## # ... with 19 more rows
Latvia has the maximum unemployment percentage followed by Finland.

Scatter plot for unemployment percentage

tidy_unemploy %>%
        ggplot(aes(x = year, y = prec, group = country)) +
        geom_jitter() +
        coord_flip()