Project - 02

Dataset 1 - Total health spending per person (US$)

Dataset 1 Location
The csv file is hosted in github for processing.
Dataset 1 CSV

Create data frame from CSV

library(RCurl)
url <- getURL("https://raw.githubusercontent.com/binishkurian/DATA-607/master/project-02/health_spending_per_person.csv")
df <- read.csv(text = url, header = TRUE, check.names = FALSE)

Tidy data

(health_spending <- as.tibble(df) %>%
        gather(`1995`:`2010`, key = "year", value = "USD", na.rm = TRUE))

## # A tibble: 3,008 x 3
##    country             year     USD
##  * <fctr>              <chr>  <dbl>
##  1 Albania             1995    27.9
##  2 Algeria             1995    62.1
##  3 Andorra             1995  1392  
##  4 Angola              1995    15.6
##  5 Antigua and Barbuda 1995   351  
##  6 Argentina           1995   615  
##  7 Armenia             1995    25.7
##  8 Australia           1995  1565  
##  9 Austria             1995  2856  
## 10 Azerbaijan          1995    18.2
## # ... with 2,998 more rows

Find the maximum spending per person for each country

(max_spending_for_each_country <-health_spending %>%
        group_by(country) %>%
        summarise(max_spending = max(USD, na.rm = TRUE)))

## # A tibble: 190 x 2
##    country             max_spending
##    <fctr>                     <dbl>
##  1 Afghanistan                 37.7
##  2 Albania                    275  
##  3 Algeria                    186  
##  4 Andorra                   3391  
##  5 Angola                     201  
##  6 Antigua and Barbuda        690  
##  7 Argentina                  742  
##  8 Armenia                    143  
##  9 Australia                 4775  
## 10 Austria                   5153  
## # ... with 180 more rows

Find the countries with maximum spending per person

max_spending_for_each_country %>%
        mutate(r = min_rank(desc(max_spending))) %>% 
        arrange(r)

## # A tibble: 190 x 3
##    country       max_spending     r
##    <fctr>               <dbl> <int>
##  1 United States         8362     1
##  2 Luxembourg            8262     2
##  3 Norway                8091     3
##  4 Switzerland           7812     4
##  5 Monaco                6692     5
##  6 Denmark               6452     6
##  7 Iceland               6175     7
##  8 Netherlands           5751     8
##  9 Ireland               5241     9
## 10 Canada                5222    10
## # ... with 180 more rows

United States has most spending followed by Luxembourg, Norway, Switzerland and Monaco

Let’s explore the spending per person for only above countries

health_spending %>%
        filter(country %in% c("United States", "Luxembourg", "Norway", "Switzerland", "Monaco")) %>%
        ggplot(mapping = aes(x = country, y = USD)) +
        geom_boxplot()

Let’s check the general trend in spending for above countries

health_spending %>%
        filter(country %in% c("United States", "Luxembourg", "Norway", "Switzerland", "Monaco")) %>%
        ggplot(mapping = aes(x = year, y = USD)) +
        geom_line(aes(group = country), colour = "grey50") + 
        geom_point(aes(colour = country))

The general trend for United States is to have increased spending year by year.

Dataset 2 - Betsy Rosalen’s Movie Ratings Dataset

Dataset 2 Location
The csv file is hosted in github for processing.
Dataset 2 CSV

Create data frame from CSV

library(RCurl)
url <- getURL("https://raw.githubusercontent.com/binishkurian/DATA-607/master/project-02/movie_rating.csv")
df <- read.csv(text = url, header = TRUE, check.names = FALSE)

Tidy data

(tidy_rating <- as.tibble(df) %>%
        select(-Timestamp) %>%
        gather(`Guardians of the Galaxy 2`:`Valerian and the City of a Thousand Planets`, key = movie, value = rating))

## # A tibble: 260 x 4
##    Name              `Number of Movies Seen` movie                   rati…
##    <fctr>                              <int> <chr>                   <int>
##  1 Eva                                     2 Guardians of the Galax…     0
##  2 Mike Gilbert                            5 Guardians of the Galax…     5
##  3 Gino                                    7 Guardians of the Galax…     5
##  4 Heather Ahram                           0 Guardians of the Galax…     0
##  5 Diane                                   1 Guardians of the Galax…     0
##  6 Anon1                                   4 Guardians of the Galax…     3
##  7 "Ryan Graziano "                        4 Guardians of the Galax…     3
##  8 Mercedes                                7 Guardians of the Galax…     4
##  9 Kerry                                   5 Guardians of the Galax…     4
## 10 Samantha Esposito                       4 Guardians of the Galax…     4
## # ... with 250 more rows

Find the movie with max average rating

tidy_rating %>%
        group_by(movie) %>%
        summarise(av_rating = mean(rating, na.rm = TRUE)) %>%
        mutate(r = min_rank(desc(av_rating))) %>%
        arrange(r)

## # A tibble: 10 x 3
##    movie                                       av_rating     r
##    <chr>                                           <dbl> <int>
##  1 Wonder Woman                                    3.12      1
##  2 Star Wars: The Last Jedi                        2.92      2
##  3 Guardians of the Galaxy 2                       2.73      3
##  4 Thor: Ragnarok                                  2.08      4
##  5 Spider-Man: Homecoming                          1.58      5
##  6 Blade Runner 2049                               1.54      6
##  7 War for the Planet of the Apes                  0.962     7
##  8 Ghost in the Shell                              0.692     8
##  9 Valerian and the City of a Thousand Planets     0.538     9
## 10 Alien: Covenant                                 0.385    10

Wonder Woman has the max average rating of 3.12 followed by Star Wars: 
The Last Jedi with average rating of 2.92

Let’s do summary computation for movies

tidy_rating %>%
        ggplot() +
        stat_summary(mapping = aes(x = movie, y = rating),
                     fun.ymin = min,
                     fun.ymax = max,
                     fun.y = mean) +
        coord_flip()

Let’s find the maximum rating provided by each evaluator

tidy_rating %>%
        group_by(Name) %>%
        summarize(max_rating = max(rating, na.rm = TRUE)) %>%
        arrange(desc(max_rating))

## # A tibble: 26 x 2
##    Name        max_rating
##    <fctr>           <dbl>
##  1 Adam              5.00
##  2 Anon2             5.00
##  3 Anon4             5.00
##  4 Betsy             5.00
##  5 Brian             5.00
##  6 Cheri             5.00
##  7 Chris VA          5.00
##  8 Diane             5.00
##  9 Gino              5.00
## 10 Hugo Walker       5.00
## # ... with 16 more rows

Do summary computation for movie evaluators

tidy_rating %>%
        ggplot() +
        stat_summary(mapping = aes(x = Name, y = rating),
                     fun.ymin = min,
                     fun.ymax = max,
                     fun.y = mean) +
        coord_flip()

Dataset 3 - Aged 55+ unemployment rate (%)

Dataset 3 Location
The csv file is hosted in github for processing.
Dataset 3 CSV

Create data frame from CSV

library(RCurl)
url <- getURL("https://raw.githubusercontent.com/binishkurian/DATA-607/master/project-02/unemployment_above_55.csv")
df <- read.csv(text = url, header = TRUE, check.names = FALSE)

Tidy data

tidy_unemploy <- as.tibble(df) %>%
        gather(`1981`:`2005`, key = year, value = prec, na.rm = TRUE)

Find maximum unemployment percentage for each country

tidy_unemploy %>%
        group_by(country) %>%
        summarise(max_perc = max(prec, na.rm = TRUE))

## # A tibble: 29 x 2
##    country          max_perc
##    <fctr>              <dbl>
##  1 Australia            9.20
##  2 Canada               9.30
##  3 Czech Rep.           5.40
##  4 Estonia              8.50
##  5 Finland             19.8 
##  6 France               9.00
##  7 Germany             14.4 
##  8 Hong Kong, China     8.20
##  9 Ireland              8.90
## 10 Japan                4.80
## # ... with 19 more rows

Find the country with max unemployment percentage

tidy_unemploy %>%
        group_by(country) %>%
        summarise(max_perc = max(prec, na.rm = TRUE)) %>%
        arrange(desc(max_perc))

## # A tibble: 29 x 2
##    country         max_perc
##    <fctr>             <dbl>
##  1 Latvia             22.8 
##  2 Finland            19.8 
##  3 Slovak Republic    15.1 
##  4 Germany            14.4 
##  5 Spain              11.7 
##  6 Lithuania          11.5 
##  7 Poland              9.70
##  8 Canada              9.30
##  9 Australia           9.20
## 10 France              9.00
## # ... with 19 more rows

Latvia has the maximum unemployment percentage followed by Finland.

Scatter plot for unemployment percentage

tidy_unemploy %>%
        ggplot(aes(x = year, y = prec, group = country)) +
        geom_jitter() +
        coord_flip()

Project - 02

Binish Kurian Chandy

3/10/2018

Dataset 1 - Total health spending per person (US$)

Create data frame from CSV

Tidy data

Find the maximum spending per person for each country

Find the countries with maximum spending per person

Let’s explore the spending per person for only above countries

Let’s check the general trend in spending for above countries

Dataset 2 - Betsy Rosalen’s Movie Ratings Dataset

Create data frame from CSV

Tidy data

Find the movie with max average rating

Let’s do summary computation for movies

Let’s find the maximum rating provided by each evaluator

Do summary computation for movie evaluators

Dataset 3 - Aged 55+ unemployment rate (%)

Create data frame from CSV

Tidy data

Find maximum unemployment percentage for each country

Find the country with max unemployment percentage

Scatter plot for unemployment percentage