1. Import libraries

library(stringr)
library(tidyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library('scales')
## Warning: package 'scales' was built under R version 3.5.3
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.5.3
library(kableExtra)
## Warning: package 'kableExtra' was built under R version 3.5.3
## 
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows

R Markdown

Animal Species Data:

The data was downloaded from the following site: https://ndownloader.figshare.com/files/2292169

References: https://r4ds.had.co.nz/tidy-data.html

        https://www.google.com/search?q=R+gather()+expalined&ie=&oe=

        https://datacarpentry.org
        
        https://uoftcoders.github.io/rcourse/lec04-dplyr.html            

dplyr functions:

select(): subset columns
filter(): subset rows on conditions
mutate(): create new columns by using information from other columns
group_by() and summarize(): create summary statisitcs on grouped data
arrange(): sort results
count(): count discrete values

tidyr functions:

spread(): opposite of gathering.You use it when an observation is scattered across multiple rows.
gather(): collects a set of column names and places them into a single "key" column.

turn columns into rows (gather()), turn rows into columns (spread()), turn a character column into multiple columns (separate()), turn multiple character columns into a single column (unite())

Retrieve Data File for Dataset 1.

dataset0 <- read.csv(file="https://raw.githubusercontent.com/AjayArora35/Data-607-Project-2/master/combined.csv", header=TRUE)
head(dataset0)
##   record_id month day year plot_id species_id sex hindfoot_length weight
## 1         1     7  16 1977       2         NL   M              32     NA
## 2        72     8  19 1977       2         NL   M              31     NA
## 3       224     9  13 1977       2         NL                  NA     NA
## 4       266    10  16 1977       2         NL                  NA     NA
## 5       349    11  12 1977       2         NL                  NA     NA
## 6       363    11  12 1977       2         NL                  NA     NA
##     genus  species   taxa plot_type
## 1 Neotoma albigula Rodent   Control
## 2 Neotoma albigula Rodent   Control
## 3 Neotoma albigula Rodent   Control
## 4 Neotoma albigula Rodent   Control
## 5 Neotoma albigula Rodent   Control
## 6 Neotoma albigula Rodent   Control

Data Clean – Get rid of NAs and any blank data

dataset1 <- dataset0 %>% filter(!is.na(hindfoot_length)) %>% filter(!is.na(sex)) %>% filter(!is.na(weight)) %>% filter(!(sex == '')) 

Analysis of various aspects of animal data

What is the total number of data elements for each sex?

res1 <- dataset1 %>%  count(sex)
#For debugging res1
ggplot(res1,
       aes(x=res1$n,
           y=res1$sex, fill = res1$sex
           ))+
    geom_bar(stat="identity", position="dodge")

##What is the Alphabetical order of all species?

dataset1 %>%  count(sex, species) %>%  arrange(species, desc(n)) %>%  head(5) %>% kable() %>%  kable_styling()
sex species n
F albigula 606
M albigula 439
F baileyi 1617
M baileyi 1186
M eremicus 659

What is the min, mean and max for all species?

dataset1 %>% group_by(species_id) %>%
    summarize(
        mean_hindfoot_length = mean(hindfoot_length),
        min_hindfoot_length = min(hindfoot_length),
        max_hindfoot_length = max(hindfoot_length),
        n = n()
    ) %>%  head(5) %>% kable() %>%  kable_styling()
species_id mean_hindfoot_length min_hindfoot_length max_hindfoot_length n
BA 13.00000 6 16 45
DM 35.99054 16 50 9727
DO 35.58781 26 64 2790
DS 49.99259 39 58 2023
NL 32.24880 21 42 1045

What is the heaviest animal per year?

dataset1 %>%    group_by(year) %>%    filter(weight == max(weight)) %>%    select(year, genus, species, weight) %>%    arrange(year)  %>%  head(5) %>% kable() %>%  kable_styling()
year genus species weight
1977 Dipodomys spectabilis 149
1978 Neotoma albigula 232
1978 Neotoma albigula 232
1979 Neotoma albigula 274
1980 Neotoma albigula 241

What is the animal weight in killograms and pounds?

dataset1 %>% mutate(weight_kg = weight / 1000, weight_lb = weight_kg * 2.2) %>%  head(5) %>% kable() %>%  kable_styling()
record_id month day year plot_id species_id sex hindfoot_length weight genus species taxa plot_type weight_kg weight_lb
845 5 6 1978 2 NL M 32 204 Neotoma albigula Rodent Control 0.204 0.4488
1164 8 5 1978 2 NL M 34 199 Neotoma albigula Rodent Control 0.199 0.4378
1261 9 4 1978 2 NL M 32 197 Neotoma albigula Rodent Control 0.197 0.4334
1756 4 29 1979 2 NL M 33 166 Neotoma albigula Rodent Control 0.166 0.3652
1818 5 30 1979 2 NL M 32 184 Neotoma albigula Rodent Control 0.184 0.4048

What is the average weight by sex and species id?

res2 <- dataset1 %>% group_by(sex, species_id) %>%  summarize(mean_weight = mean(weight, na.rm = TRUE))

ggplot(res2,
       aes(x=res2$species_id,
           y=res2$mean_weight, fill = res2$sex
           ))+
    geom_bar(stat="identity", position="dodge")

What is the sorted list of data by mean_weight in descending order?

dataset1 %>%  group_by(sex, species_id) %>%  summarize(mean_weight = mean(weight), min_weight = min(weight)) %>%  arrange(desc(mean_weight))   %>%  head(5) %>% kable() %>%  kable_styling()
sex species_id mean_weight min_weight
M NL 164.87927 30
F NL 154.36964 32
M DS 122.88410 12
F DS 117.39224 45
F SH 79.50704 30

What is mean_weight by genus?

dataset1_01 <- dataset1 %>% group_by(genus, plot_id) %>% summarize(mean_weight = mean(weight)) 
dataset1_01 %>%  head(5) %>% kable() %>%  kable_styling()
genus plot_id mean_weight
Baiomys 1 7.000000
Baiomys 2 6.000000
Baiomys 3 8.611111
Baiomys 5 7.750000
Baiomys 18 9.500000

What is the mean_weight by plot_id and genus?

dataset1_01_spread <- dataset1_01 %>%  spread(key = genus, value = mean_weight, fill=0)
dataset1_01_spread %>%  head(5) %>% kable() %>%  kable_styling()
plot_id Baiomys Chaetodipus Dipodomys Neotoma Onychomys Perognathus Peromyscus Reithrodontomys Sigmodon
1 7.000000 22.35312 59.41196 157.2200 27.63194 9.709677 22.15385 11.37500 0.00000
2 6.000000 25.15205 55.24145 166.6391 26.62917 7.071429 22.15698 10.63235 65.77778
3 8.611111 24.73460 51.56915 159.7439 26.02381 7.504348 21.44762 10.53769 66.50000
4 0.000000 22.90909 56.41931 159.6000 27.55833 7.943548 22.60000 10.27778 82.00000
5 7.750000 17.96364 50.87455 191.6250 26.86127 8.743590 21.16049 11.12381 82.66667

What is mean_weight by plot_id?

dataset1_02_gather <- dataset1_01_spread %>% gather(key = genus, value = mean_weight, -plot_id) 
dataset1_02_gather %>% head(5) %>% kable() %>%  kable_styling()
plot_id genus mean_weight
1 Baiomys 7.000000
2 Baiomys 6.000000
3 Baiomys 8.611111
4 Baiomys 0.000000
5 Baiomys 7.750000

Dataset 2 – Steel Imports Data

Reference: https://www.census.gov

       https://www.census.gov/foreign-trade/Press-Release/2018pr/03/steel/steel2bf.xls
       

Retrieve Data File for Dataset 2.

#https://stackoverflow.com/questions/39110755/skip-specific-rows-using-read-csv-in-r
dataset2 <- read.csv(file="https://raw.githubusercontent.com/AjayArora35/Data-607-Project-2/master/steel2bf (2).csv", header=FALSE, stringsAsFactors = FALSE, skip=15)
head(dataset2,5)%>% kable() %>%  kable_styling()
V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16 V17 V18 V19 V20 V21 V22 V23 V24 V25 V26 V27 V28 V29 V30 V31 V32 V33 V34 V35 V36 V37 V38 V39 V40 V41 V42 V43 V44 V45 V46 V47 V48 V49 V50 V51 V52 V53 V54 V55 V56 V57 V58 V59 V60 V61 V62 V63 V64 V65 V66 V67 V68 V69 V70 V71 V72 V73 V74 V75 V76 V77 V78 V79 V80 V81 V82 V83 V84 V85 V86 V87 V88 V89 V90 V91 V92 V93 V94 V95 V96 V97 V98 V99 V100 V101 V102 V103 V104 V105 V106 V107 V108 V109 V110 V111 V112 V113 V114 V115 V116 V117 V118 V119 V120 V121 V122 V123 V124 V125 V126 V127 V128 V129 V130 V131 V132 V133 V134 V135 V136 V137 V138 V139 V140 V141 V142 V143 V144 V145 V146 V147 V148 V149 V150 V151 V152 V153 V154 V155 V156 V157 V158 V159 V160 V161 V162 V163 V164 V165 V166 V167 V168 V169 V170 V171 V172 V173 V174 V175 V176 V177 V178 V179 V180 V181 V182 V183 V184 V185 V186 V187 V188 V189 V190 V191 V192 V193 V194 V195 V196 V197 V198 V199 V200 V201 V202 V203 V204 V205 V206 V207 V208 V209 V210 V211 V212 V213 V214 V215 V216 V217 V218 V219 V220 V221 V222 V223 V224 V225 V226 V227 V228 V229 V230 V231 V232 V233 V234
North America 11,284 35,667 8,652 29,088 9,599 29,928 10,007 28,129 29,862 97,002 28,825 86,937 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
Canada 4,265 22,062 4,018 19,742 3,560 18,850 2,795 15,745 12,541 62,596 8,993 51,388 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
Mexico 7,019 13,604 4,634 9,346 6,039 11,078 7,212 12,383 17,321 34,406 19,832 35,549 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
Europe 47,170 183,656 40,246 156,209 41,862 164,523 33,255 124,861 129,763 497,867 110,278 416,014 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
European Union 45,085 177,054 38,807 150,295 39,659 156,743 30,980 117,190 122,900 477,364 104,035 391,818 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA

Data Clean

Get the first 47 observations X 13 variables

#https://stats.idre.ucla.edu/r/modules/subsetting-data/
dataset2 <- dataset2[1:47,1:13]

head(dataset2,5)%>% kable() %>%  kable_styling()
V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13
North America 11,284 35,667 8,652 29,088 9,599 29,928 10,007 28,129 29,862 97,002 28,825 86,937
Canada 4,265 22,062 4,018 19,742 3,560 18,850 2,795 15,745 12,541 62,596 8,993 51,388
Mexico 7,019 13,604 4,634 9,346 6,039 11,078 7,212 12,383 17,321 34,406 19,832 35,549
Europe 47,170 183,656 40,246 156,209 41,862 164,523 33,255 124,861 129,763 497,867 110,278 416,014
European Union 45,085 177,054 38,807 150,295 39,659 156,743 30,980 117,190 122,900 477,364 104,035 391,818

Get rid of (-) present in the data; replace with NA

dataset2[dataset2 =='(-)'] <- NA
head(dataset2,5)%>% kable() %>%  kable_styling()
V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13
North America 11,284 35,667 8,652 29,088 9,599 29,928 10,007 28,129 29,862 97,002 28,825 86,937
Canada 4,265 22,062 4,018 19,742 3,560 18,850 2,795 15,745 12,541 62,596 8,993 51,388
Mexico 7,019 13,604 4,634 9,346 6,039 11,078 7,212 12,383 17,321 34,406 19,832 35,549
Europe 47,170 183,656 40,246 156,209 41,862 164,523 33,255 124,861 129,763 497,867 110,278 416,014
European Union 45,085 177,054 38,807 150,295 39,659 156,743 30,980 117,190 122,900 477,364 104,035 391,818

Give names to columns

names(dataset2) <- c('Countries','032018_quantity','032018_value','022018_quantity','022018_value',
                                 '032017_quantity','032017_value', '022017_quantity','022017_value',
                                 '03122018_quantity','03122018_value', '03122017_quantity','03122017_value')
head(dataset2,5)%>% kable() %>%  kable_styling()
Countries 032018_quantity 032018_value 022018_quantity 022018_value 032017_quantity 032017_value 022017_quantity 022017_value 03122018_quantity 03122018_value 03122017_quantity 03122017_value
North America 11,284 35,667 8,652 29,088 9,599 29,928 10,007 28,129 29,862 97,002 28,825 86,937
Canada 4,265 22,062 4,018 19,742 3,560 18,850 2,795 15,745 12,541 62,596 8,993 51,388
Mexico 7,019 13,604 4,634 9,346 6,039 11,078 7,212 12,383 17,321 34,406 19,832 35,549
Europe 47,170 183,656 40,246 156,209 41,862 164,523 33,255 124,861 129,763 497,867 110,278 416,014
European Union 45,085 177,054 38,807 150,295 39,659 156,743 30,980 117,190 122,900 477,364 104,035 391,818

Get rid of all the commas in the numeric values and convert to numeric data type

dataset2$`032018_quantity` <- as.numeric(gsub(",","",dataset2$`032018_quantity`))
dataset2$`032018_value` <- as.numeric(gsub(",","",dataset2$`032018_value`))
dataset2$`022018_quantity` <- as.numeric(gsub(",","",dataset2$`022018_quantity`))
dataset2$`022018_value` <- as.numeric(gsub(",","",dataset2$`022018_value`))
dataset2$`032017_quantity` <- as.numeric(gsub(",","",dataset2$`032017_quantity`))
dataset2$`032017_value` <- as.numeric(gsub(",","",dataset2$`032017_value`))
dataset2$`022017_quantity` <- as.numeric(gsub(",","",dataset2$`022017_quantity`))
dataset2$`022017_value` <- as.numeric(gsub(",","",dataset2$`022017_value`))
dataset2$`03122018_quantity` <- as.numeric(gsub(",","",dataset2$`03122018_quantity`))
dataset2$`03122018_value` <- as.numeric(gsub(",","",dataset2$`03122018_value`))
dataset2$`03122017_quantity` <- as.numeric(gsub(",","",dataset2$`03122017_quantity`))
dataset2$`03122017_value` <- as.numeric(gsub(",","",dataset2$`03122017_value`))

#head(dataset2)

#https://stackoverflow.com/questions/45576805/how-to-replace-all-na-in-a-dataframe-using-tidyrreplace-na
tempList <- setNames(lapply(vector("list", ncol(dataset2)), function(x) x <- 0), names(dataset2))

# Now use that list in tidyr::replace_na 
dataset2 <- dataset2 %>% replace_na(tempList)
head(dataset2, 5)%>% kable() %>%  kable_styling()
Countries 032018_quantity 032018_value 022018_quantity 022018_value 032017_quantity 032017_value 022017_quantity 022017_value 03122018_quantity 03122018_value 03122017_quantity 03122017_value
North America 11284 35667 8652 29088 9599 29928 10007 28129 29862 97002 28825 86937
Canada 4265 22062 4018 19742 3560 18850 2795 15745 12541 62596 8993 51388
Mexico 7019 13604 4634 9346 6039 11078 7212 12383 17321 34406 19832 35549
Europe 47170 183656 40246 156209 41862 164523 33255 124861 129763 497867 110278 416014
European Union 45085 177054 38807 150295 39659 156743 30980 117190 122900 477364 104035 391818

Data Analysis

dataset2_final = gather(dataset2,month_year_type, Number,2:13, factor_key = TRUE) %>% separate(month_year_type, c('month_year','quant_value'),sep='_') %>%
                spread(month_year,'Number', convert=TRUE)
head(dataset2_final, 5)%>% kable() %>%  kable_styling()
Countries quant_value 022017 022018 03122017 03122018 032017 032018
Argentina quantity 0 3 0 3 0 0
Argentina value 0 14 0 14 0 0
Australia quantity 15 56 52 97 0 20
Australia value 149 576 556 1008 6 206
Austria quantity 600 1404 2261 4438 1095 1630
dataset2_final$`032018` <- as.numeric(gsub(",","",dataset2_final$`032018`))
dataset2_final$`022017` <- as.numeric(gsub(",","",dataset2_final$`022017`))
dataset2_final$`022018` <- as.numeric(gsub(",","",dataset2_final$`022018`))
dataset2_final$`03122017` <- as.numeric(gsub(",","",dataset2_final$`03122017`))
dataset2_final$`03122018` <- as.numeric(gsub(",","",dataset2_final$`03122018`))
dataset2_final$`032017` <- as.numeric(gsub(",","",dataset2_final$`032017`))

final_quantity <- dataset2_final %>% filter(quant_value =='quantity') %>% rowwise() %>% mutate(totalquantity = sum(`032018`,`03122018`,`032017`, `03122017`, na.rm=TRUE)) %>% arrange(desc(totalquantity))

head(final_quantity, 5)%>% kable() %>%  kable_styling()
Countries quant_value 022017 022018 03122017 03122018 032017 032018 totalquantity
Europe quantity 33255 40246 110278 129763 41862 47170 329073
European Union quantity 30980 38807 104035 122900 39659 45085 311679
Euro Area quantity 22224 31129 78507 94874 29769 34848 237998
Pacific Rim Countries quantity 22381 45401 71704 102824 25457 28455 228440
Other Countries quantity 10822 9097 33141 30908 12567 11108 87724

What is the distribution of Countries and Quantities?

ggplot(final_quantity,
       aes(x=final_quantity$totalquantity,
           y=final_quantity$Countries, fill = final_quantity$totalquantity
           ))+
  geom_point()

What is the distribution of Countries and Values?

final_value <- dataset2_final %>% filter(quant_value =='value') %>% rowwise() %>% mutate(totalvalue = sum(`032018`,`03122018`,`032017`, `03122017`, na.rm=TRUE)) %>% arrange(desc(totalvalue))

head(final_value, 5)%>% kable() %>%  kable_styling()
Countries quant_value 022017 022018 03122017 03122018 032017 032018 totalvalue
Europe value 124861 156209 416014 497867 164523 183656 1262060
European Union value 117190 150295 391818 477364 156743 177054 1202979
Euro Area value 88961 120555 301994 370970 117668 137157 927789
Pacific Rim Countries value 81213 137556 262328 350504 95804 108106 816742
Germany value 36296 39553 117740 126494 45631 45670 335535
ggplot(final_value,
       aes(x=final_value$totalvalue,
           y=final_value$Countries, fill = final_value$totalvalue
           ))+
  geom_point()

What is the distribution ratio of Countries?

final_ratio <- final_quantity$totalquantity/final_value$totalvalue
#For debugging final_ratio

ggplot(final_value,
       aes(x=final_ratio,
           y=final_value$Countries
           ))+
  geom_point()
## Warning: Removed 2 rows containing missing values (geom_point).

Dataset 3 – GapMinder Wide Dataset

Reference: https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/GJQNEQ

Retrieve Data File for Dataset 3.

dataset3 <- read.csv(file="https://raw.githubusercontent.com/AjayArora35/Data-607-Project-2/master/gapminder_wide (1).csv", header=TRUE, stringsAsFactors = FALSE)
head(dataset3,5) 
##   continent      country gdpPercap_1952 gdpPercap_1957 gdpPercap_1962
## 1    Africa      Algeria      2449.0082      3013.9760      2550.8169
## 2    Africa       Angola      3520.6103      3827.9405      4269.2767
## 3    Africa        Benin      1062.7522       959.6011       949.4991
## 4    Africa     Botswana       851.2411       918.2325       983.6540
## 5    Africa Burkina Faso       543.2552       617.1835       722.5120
##   gdpPercap_1967 gdpPercap_1972 gdpPercap_1977 gdpPercap_1982
## 1      3246.9918       4182.664       4910.417      5745.1602
## 2      5522.7764       5473.288       3008.647      2756.9537
## 3      1035.8314       1085.797       1029.161      1277.8976
## 4      1214.7093       2263.611       3214.858      4551.1421
## 5       794.8266        854.736        743.387       807.1986
##   gdpPercap_1987 gdpPercap_1992 gdpPercap_1997 gdpPercap_2002
## 1      5681.3585      5023.2166       4797.295       5288.040
## 2      2430.2083      2627.8457       2277.141       2773.287
## 3      1225.8560      1191.2077       1232.975       1372.878
## 4      6205.8839      7954.1116       8647.142      11003.605
## 5       912.0631       931.7528        946.295       1037.645
##   gdpPercap_2007 lifeExp_1952 lifeExp_1957 lifeExp_1962 lifeExp_1967
## 1       6223.367       43.077       45.685       48.303       51.407
## 2       4797.231       30.015       31.999       34.000       35.985
## 3       1441.285       38.223       40.358       42.618       44.885
## 4      12569.852       47.622       49.618       51.520       53.298
## 5       1217.033       31.975       34.906       37.814       40.697
##   lifeExp_1972 lifeExp_1977 lifeExp_1982 lifeExp_1987 lifeExp_1992
## 1       54.518       58.014       61.368       65.799       67.744
## 2       37.928       39.483       39.942       39.906       40.647
## 3       47.014       49.190       50.904       52.337       53.919
## 4       56.024       59.319       61.484       63.622       62.745
## 5       43.591       46.137       48.122       49.557       50.260
##   lifeExp_1997 lifeExp_2002 lifeExp_2007 pop_1952 pop_1957 pop_1962
## 1       69.152       70.994       72.301  9279525 10270856 11000948
## 2       40.963       41.003       42.731  4232095  4561361  4826015
## 3       54.777       54.406       56.728  1738315  1925173  2151895
## 4       52.556       46.634       50.728   442308   474639   512764
## 5       50.324       50.650       52.295  4469979  4713416  4919632
##   pop_1967 pop_1972 pop_1977 pop_1982 pop_1987 pop_1992 pop_1997 pop_2002
## 1 12760499 14760787 17152804 20033753 23254956 26298373 29072015 31287142
## 2  5247469  5894858  6162675  7016384  7874230  8735988  9875024 10866106
## 3  2427334  2761407  3168267  3641603  4243788  4981671  6066080  7026113
## 4   553541   619351   781472   970347  1151184  1342614  1536536  1630347
## 5  5127935  5433886  5889574  6634596  7586551  8878303 10352843 12251209
##   pop_2007
## 1 33333216
## 2 12420476
## 3  8078314
## 4  1639131
## 5 14326203

Data Analysis

dataset3_long <- dataset3 %>% 
  gather(key   = obstype_year,
         value = values,
         -continent, -country) %>%
  separate(obstype_year,
           into = c('type','year'),
           sep = "_",
           convert = TRUE) #this ensures that the year column is an integer rather than a character

head(dataset3_long,5)
##   continent      country      type year    values
## 1    Africa      Algeria gdpPercap 1952 2449.0082
## 2    Africa       Angola gdpPercap 1952 3520.6103
## 3    Africa        Benin gdpPercap 1952 1062.7522
## 4    Africa     Botswana gdpPercap 1952  851.2411
## 5    Africa Burkina Faso gdpPercap 1952  543.2552

What is distribution of GDP per/cap per country?

ggplot(dataset3_long,
       aes(x=dataset3_long$values,
           y=dataset3_long$country
           ))+
  geom_point()

What is the distribution of GDP per/year per continent?

bycontinents <- dataset3_long %>% group_by(continent, year) %>% summarize(bycontinents_mean = mean(values)) 
head(bycontinents, 5)
## # A tibble: 5 x 3
## # Groups:   continent [1]
##   continent  year bycontinents_mean
##   <chr>     <int>             <dbl>
## 1 Africa     1952          1523767.
## 2 Africa     1957          1698153.
## 3 Africa     1962          1901296.
## 4 Africa     1967          2149990.
## 5 Africa     1972          2435921.
ggplot(bycontinents,
       aes(x=bycontinents$year,
           y=bycontinents$bycontinents_mean, fill = bycontinents$continent
           ))+
    geom_bar(stat="identity", position="dodge")

What is the distribution of Life Expectancy per year/per continent?

bylifeexp <- dataset3_long %>%   filter(type == "lifeExp") %>%   group_by(continent, year) %>% summarize(bylifeexp_mean = mean(values)) 


head(bylifeexp,5)
## # A tibble: 5 x 3
## # Groups:   continent [1]
##   continent  year bylifeexp_mean
##   <chr>     <int>          <dbl>
## 1 Africa     1952           39.1
## 2 Africa     1957           41.3
## 3 Africa     1962           43.3
## 4 Africa     1967           45.3
## 5 Africa     1972           47.5

What is distribution of GDP per/cap per country?

ggplot(bylifeexp,
       aes(x=bylifeexp$year,
           y=bylifeexp$bylifeexp_mean, fill = bylifeexp$continent
           ))+
  geom_bar(stat="identity", position="dodge")

What is the distribution of Population per year/per continent?

bypop <- dataset3_long %>%   filter(type == "pop") %>%   group_by(continent, year) %>% summarize(bypop_mean = mean(values)) 


head(bypop,5)
## # A tibble: 5 x 3
## # Groups:   continent [1]
##   continent  year bypop_mean
##   <chr>     <int>      <dbl>
## 1 Africa     1952   4570010.
## 2 Africa     1957   5093033.
## 3 Africa     1962   5702247.
## 4 Africa     1967   6447875.
## 5 Africa     1972   7305376.

What is distribution of GDP per/cap per country?

ggplot(bypop,
       aes(x=bypop$year,
           y=bypop$bypop_mean, fill = bypop$continent
           ))+
  geom_bar(stat="identity", position="dodge")