1. Import libraries
library(stringr)
library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library('scales')
## Warning: package 'scales' was built under R version 3.5.3
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.5.3
library(kableExtra)
## Warning: package 'kableExtra' was built under R version 3.5.3
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
R Markdown
dplyr functions:
select(): subset columns
filter(): subset rows on conditions
mutate(): create new columns by using information from other columns
group_by() and summarize(): create summary statisitcs on grouped data
arrange(): sort results
count(): count discrete values
tidyr functions:
spread(): opposite of gathering.You use it when an observation is scattered across multiple rows.
gather(): collects a set of column names and places them into a single "key" column.
turn columns into rows (gather()), turn rows into columns (spread()), turn a character column into multiple columns (separate()), turn multiple character columns into a single column (unite())
Retrieve Data File for Dataset 1.
dataset0 <- read.csv(file="https://raw.githubusercontent.com/AjayArora35/Data-607-Project-2/master/combined.csv", header=TRUE)
head(dataset0)
## record_id month day year plot_id species_id sex hindfoot_length weight
## 1 1 7 16 1977 2 NL M 32 NA
## 2 72 8 19 1977 2 NL M 31 NA
## 3 224 9 13 1977 2 NL NA NA
## 4 266 10 16 1977 2 NL NA NA
## 5 349 11 12 1977 2 NL NA NA
## 6 363 11 12 1977 2 NL NA NA
## genus species taxa plot_type
## 1 Neotoma albigula Rodent Control
## 2 Neotoma albigula Rodent Control
## 3 Neotoma albigula Rodent Control
## 4 Neotoma albigula Rodent Control
## 5 Neotoma albigula Rodent Control
## 6 Neotoma albigula Rodent Control
Data Clean – Get rid of NAs and any blank data
dataset1 <- dataset0 %>% filter(!is.na(hindfoot_length)) %>% filter(!is.na(sex)) %>% filter(!is.na(weight)) %>% filter(!(sex == ''))
Analysis of various aspects of animal data
What is the total number of data elements for each sex?
res1 <- dataset1 %>% count(sex)
#For debugging res1
ggplot(res1,
aes(x=res1$n,
y=res1$sex, fill = res1$sex
))+
geom_bar(stat="identity", position="dodge")
##What is the Alphabetical order of all species?
dataset1 %>% count(sex, species) %>% arrange(species, desc(n)) %>% head(5) %>% kable() %>% kable_styling()
|
sex
|
species
|
n
|
|
F
|
albigula
|
606
|
|
M
|
albigula
|
439
|
|
F
|
baileyi
|
1617
|
|
M
|
baileyi
|
1186
|
|
M
|
eremicus
|
659
|
What is the min, mean and max for all species?
dataset1 %>% group_by(species_id) %>%
summarize(
mean_hindfoot_length = mean(hindfoot_length),
min_hindfoot_length = min(hindfoot_length),
max_hindfoot_length = max(hindfoot_length),
n = n()
) %>% head(5) %>% kable() %>% kable_styling()
|
species_id
|
mean_hindfoot_length
|
min_hindfoot_length
|
max_hindfoot_length
|
n
|
|
BA
|
13.00000
|
6
|
16
|
45
|
|
DM
|
35.99054
|
16
|
50
|
9727
|
|
DO
|
35.58781
|
26
|
64
|
2790
|
|
DS
|
49.99259
|
39
|
58
|
2023
|
|
NL
|
32.24880
|
21
|
42
|
1045
|
What is the heaviest animal per year?
dataset1 %>% group_by(year) %>% filter(weight == max(weight)) %>% select(year, genus, species, weight) %>% arrange(year) %>% head(5) %>% kable() %>% kable_styling()
|
year
|
genus
|
species
|
weight
|
|
1977
|
Dipodomys
|
spectabilis
|
149
|
|
1978
|
Neotoma
|
albigula
|
232
|
|
1978
|
Neotoma
|
albigula
|
232
|
|
1979
|
Neotoma
|
albigula
|
274
|
|
1980
|
Neotoma
|
albigula
|
241
|
What is the animal weight in killograms and pounds?
dataset1 %>% mutate(weight_kg = weight / 1000, weight_lb = weight_kg * 2.2) %>% head(5) %>% kable() %>% kable_styling()
|
record_id
|
month
|
day
|
year
|
plot_id
|
species_id
|
sex
|
hindfoot_length
|
weight
|
genus
|
species
|
taxa
|
plot_type
|
weight_kg
|
weight_lb
|
|
845
|
5
|
6
|
1978
|
2
|
NL
|
M
|
32
|
204
|
Neotoma
|
albigula
|
Rodent
|
Control
|
0.204
|
0.4488
|
|
1164
|
8
|
5
|
1978
|
2
|
NL
|
M
|
34
|
199
|
Neotoma
|
albigula
|
Rodent
|
Control
|
0.199
|
0.4378
|
|
1261
|
9
|
4
|
1978
|
2
|
NL
|
M
|
32
|
197
|
Neotoma
|
albigula
|
Rodent
|
Control
|
0.197
|
0.4334
|
|
1756
|
4
|
29
|
1979
|
2
|
NL
|
M
|
33
|
166
|
Neotoma
|
albigula
|
Rodent
|
Control
|
0.166
|
0.3652
|
|
1818
|
5
|
30
|
1979
|
2
|
NL
|
M
|
32
|
184
|
Neotoma
|
albigula
|
Rodent
|
Control
|
0.184
|
0.4048
|
What is the average weight by sex and species id?
res2 <- dataset1 %>% group_by(sex, species_id) %>% summarize(mean_weight = mean(weight, na.rm = TRUE))
ggplot(res2,
aes(x=res2$species_id,
y=res2$mean_weight, fill = res2$sex
))+
geom_bar(stat="identity", position="dodge")

What is the sorted list of data by mean_weight in descending order?
dataset1 %>% group_by(sex, species_id) %>% summarize(mean_weight = mean(weight), min_weight = min(weight)) %>% arrange(desc(mean_weight)) %>% head(5) %>% kable() %>% kable_styling()
|
sex
|
species_id
|
mean_weight
|
min_weight
|
|
M
|
NL
|
164.87927
|
30
|
|
F
|
NL
|
154.36964
|
32
|
|
M
|
DS
|
122.88410
|
12
|
|
F
|
DS
|
117.39224
|
45
|
|
F
|
SH
|
79.50704
|
30
|
What is mean_weight by genus?
dataset1_01 <- dataset1 %>% group_by(genus, plot_id) %>% summarize(mean_weight = mean(weight))
dataset1_01 %>% head(5) %>% kable() %>% kable_styling()
|
genus
|
plot_id
|
mean_weight
|
|
Baiomys
|
1
|
7.000000
|
|
Baiomys
|
2
|
6.000000
|
|
Baiomys
|
3
|
8.611111
|
|
Baiomys
|
5
|
7.750000
|
|
Baiomys
|
18
|
9.500000
|
What is the mean_weight by plot_id and genus?
dataset1_01_spread <- dataset1_01 %>% spread(key = genus, value = mean_weight, fill=0)
dataset1_01_spread %>% head(5) %>% kable() %>% kable_styling()
|
plot_id
|
Baiomys
|
Chaetodipus
|
Dipodomys
|
Neotoma
|
Onychomys
|
Perognathus
|
Peromyscus
|
Reithrodontomys
|
Sigmodon
|
|
1
|
7.000000
|
22.35312
|
59.41196
|
157.2200
|
27.63194
|
9.709677
|
22.15385
|
11.37500
|
0.00000
|
|
2
|
6.000000
|
25.15205
|
55.24145
|
166.6391
|
26.62917
|
7.071429
|
22.15698
|
10.63235
|
65.77778
|
|
3
|
8.611111
|
24.73460
|
51.56915
|
159.7439
|
26.02381
|
7.504348
|
21.44762
|
10.53769
|
66.50000
|
|
4
|
0.000000
|
22.90909
|
56.41931
|
159.6000
|
27.55833
|
7.943548
|
22.60000
|
10.27778
|
82.00000
|
|
5
|
7.750000
|
17.96364
|
50.87455
|
191.6250
|
26.86127
|
8.743590
|
21.16049
|
11.12381
|
82.66667
|
What is mean_weight by plot_id?
dataset1_02_gather <- dataset1_01_spread %>% gather(key = genus, value = mean_weight, -plot_id)
dataset1_02_gather %>% head(5) %>% kable() %>% kable_styling()
|
plot_id
|
genus
|
mean_weight
|
|
1
|
Baiomys
|
7.000000
|
|
2
|
Baiomys
|
6.000000
|
|
3
|
Baiomys
|
8.611111
|
|
4
|
Baiomys
|
0.000000
|
|
5
|
Baiomys
|
7.750000
|
Dataset 2 – Steel Imports Data
Reference: https://www.census.gov
https://www.census.gov/foreign-trade/Press-Release/2018pr/03/steel/steel2bf.xls
Retrieve Data File for Dataset 2.
#https://stackoverflow.com/questions/39110755/skip-specific-rows-using-read-csv-in-r
dataset2 <- read.csv(file="https://raw.githubusercontent.com/AjayArora35/Data-607-Project-2/master/steel2bf (2).csv", header=FALSE, stringsAsFactors = FALSE, skip=15)
head(dataset2,5)%>% kable() %>% kable_styling()
|
V1
|
V2
|
V3
|
V4
|
V5
|
V6
|
V7
|
V8
|
V9
|
V10
|
V11
|
V12
|
V13
|
V14
|
V15
|
V16
|
V17
|
V18
|
V19
|
V20
|
V21
|
V22
|
V23
|
V24
|
V25
|
V26
|
V27
|
V28
|
V29
|
V30
|
V31
|
V32
|
V33
|
V34
|
V35
|
V36
|
V37
|
V38
|
V39
|
V40
|
V41
|
V42
|
V43
|
V44
|
V45
|
V46
|
V47
|
V48
|
V49
|
V50
|
V51
|
V52
|
V53
|
V54
|
V55
|
V56
|
V57
|
V58
|
V59
|
V60
|
V61
|
V62
|
V63
|
V64
|
V65
|
V66
|
V67
|
V68
|
V69
|
V70
|
V71
|
V72
|
V73
|
V74
|
V75
|
V76
|
V77
|
V78
|
V79
|
V80
|
V81
|
V82
|
V83
|
V84
|
V85
|
V86
|
V87
|
V88
|
V89
|
V90
|
V91
|
V92
|
V93
|
V94
|
V95
|
V96
|
V97
|
V98
|
V99
|
V100
|
V101
|
V102
|
V103
|
V104
|
V105
|
V106
|
V107
|
V108
|
V109
|
V110
|
V111
|
V112
|
V113
|
V114
|
V115
|
V116
|
V117
|
V118
|
V119
|
V120
|
V121
|
V122
|
V123
|
V124
|
V125
|
V126
|
V127
|
V128
|
V129
|
V130
|
V131
|
V132
|
V133
|
V134
|
V135
|
V136
|
V137
|
V138
|
V139
|
V140
|
V141
|
V142
|
V143
|
V144
|
V145
|
V146
|
V147
|
V148
|
V149
|
V150
|
V151
|
V152
|
V153
|
V154
|
V155
|
V156
|
V157
|
V158
|
V159
|
V160
|
V161
|
V162
|
V163
|
V164
|
V165
|
V166
|
V167
|
V168
|
V169
|
V170
|
V171
|
V172
|
V173
|
V174
|
V175
|
V176
|
V177
|
V178
|
V179
|
V180
|
V181
|
V182
|
V183
|
V184
|
V185
|
V186
|
V187
|
V188
|
V189
|
V190
|
V191
|
V192
|
V193
|
V194
|
V195
|
V196
|
V197
|
V198
|
V199
|
V200
|
V201
|
V202
|
V203
|
V204
|
V205
|
V206
|
V207
|
V208
|
V209
|
V210
|
V211
|
V212
|
V213
|
V214
|
V215
|
V216
|
V217
|
V218
|
V219
|
V220
|
V221
|
V222
|
V223
|
V224
|
V225
|
V226
|
V227
|
V228
|
V229
|
V230
|
V231
|
V232
|
V233
|
V234
|
|
North America
|
11,284
|
35,667
|
8,652
|
29,088
|
9,599
|
29,928
|
10,007
|
28,129
|
29,862
|
97,002
|
28,825
|
86,937
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
|
Canada
|
4,265
|
22,062
|
4,018
|
19,742
|
3,560
|
18,850
|
2,795
|
15,745
|
12,541
|
62,596
|
8,993
|
51,388
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
|
Mexico
|
7,019
|
13,604
|
4,634
|
9,346
|
6,039
|
11,078
|
7,212
|
12,383
|
17,321
|
34,406
|
19,832
|
35,549
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
|
Europe
|
47,170
|
183,656
|
40,246
|
156,209
|
41,862
|
164,523
|
33,255
|
124,861
|
129,763
|
497,867
|
110,278
|
416,014
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
|
European Union
|
45,085
|
177,054
|
38,807
|
150,295
|
39,659
|
156,743
|
30,980
|
117,190
|
122,900
|
477,364
|
104,035
|
391,818
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA
|
Data Clean
Get the first 47 observations X 13 variables
#https://stats.idre.ucla.edu/r/modules/subsetting-data/
dataset2 <- dataset2[1:47,1:13]
head(dataset2,5)%>% kable() %>% kable_styling()
|
V1
|
V2
|
V3
|
V4
|
V5
|
V6
|
V7
|
V8
|
V9
|
V10
|
V11
|
V12
|
V13
|
|
North America
|
11,284
|
35,667
|
8,652
|
29,088
|
9,599
|
29,928
|
10,007
|
28,129
|
29,862
|
97,002
|
28,825
|
86,937
|
|
Canada
|
4,265
|
22,062
|
4,018
|
19,742
|
3,560
|
18,850
|
2,795
|
15,745
|
12,541
|
62,596
|
8,993
|
51,388
|
|
Mexico
|
7,019
|
13,604
|
4,634
|
9,346
|
6,039
|
11,078
|
7,212
|
12,383
|
17,321
|
34,406
|
19,832
|
35,549
|
|
Europe
|
47,170
|
183,656
|
40,246
|
156,209
|
41,862
|
164,523
|
33,255
|
124,861
|
129,763
|
497,867
|
110,278
|
416,014
|
|
European Union
|
45,085
|
177,054
|
38,807
|
150,295
|
39,659
|
156,743
|
30,980
|
117,190
|
122,900
|
477,364
|
104,035
|
391,818
|
Get rid of (-) present in the data; replace with NA
dataset2[dataset2 =='(-)'] <- NA
head(dataset2,5)%>% kable() %>% kable_styling()
|
V1
|
V2
|
V3
|
V4
|
V5
|
V6
|
V7
|
V8
|
V9
|
V10
|
V11
|
V12
|
V13
|
|
North America
|
11,284
|
35,667
|
8,652
|
29,088
|
9,599
|
29,928
|
10,007
|
28,129
|
29,862
|
97,002
|
28,825
|
86,937
|
|
Canada
|
4,265
|
22,062
|
4,018
|
19,742
|
3,560
|
18,850
|
2,795
|
15,745
|
12,541
|
62,596
|
8,993
|
51,388
|
|
Mexico
|
7,019
|
13,604
|
4,634
|
9,346
|
6,039
|
11,078
|
7,212
|
12,383
|
17,321
|
34,406
|
19,832
|
35,549
|
|
Europe
|
47,170
|
183,656
|
40,246
|
156,209
|
41,862
|
164,523
|
33,255
|
124,861
|
129,763
|
497,867
|
110,278
|
416,014
|
|
European Union
|
45,085
|
177,054
|
38,807
|
150,295
|
39,659
|
156,743
|
30,980
|
117,190
|
122,900
|
477,364
|
104,035
|
391,818
|
Give names to columns
names(dataset2) <- c('Countries','032018_quantity','032018_value','022018_quantity','022018_value',
'032017_quantity','032017_value', '022017_quantity','022017_value',
'03122018_quantity','03122018_value', '03122017_quantity','03122017_value')
head(dataset2,5)%>% kable() %>% kable_styling()
|
Countries
|
032018_quantity
|
032018_value
|
022018_quantity
|
022018_value
|
032017_quantity
|
032017_value
|
022017_quantity
|
022017_value
|
03122018_quantity
|
03122018_value
|
03122017_quantity
|
03122017_value
|
|
North America
|
11,284
|
35,667
|
8,652
|
29,088
|
9,599
|
29,928
|
10,007
|
28,129
|
29,862
|
97,002
|
28,825
|
86,937
|
|
Canada
|
4,265
|
22,062
|
4,018
|
19,742
|
3,560
|
18,850
|
2,795
|
15,745
|
12,541
|
62,596
|
8,993
|
51,388
|
|
Mexico
|
7,019
|
13,604
|
4,634
|
9,346
|
6,039
|
11,078
|
7,212
|
12,383
|
17,321
|
34,406
|
19,832
|
35,549
|
|
Europe
|
47,170
|
183,656
|
40,246
|
156,209
|
41,862
|
164,523
|
33,255
|
124,861
|
129,763
|
497,867
|
110,278
|
416,014
|
|
European Union
|
45,085
|
177,054
|
38,807
|
150,295
|
39,659
|
156,743
|
30,980
|
117,190
|
122,900
|
477,364
|
104,035
|
391,818
|
Get rid of all the commas in the numeric values and convert to numeric data type
dataset2$`032018_quantity` <- as.numeric(gsub(",","",dataset2$`032018_quantity`))
dataset2$`032018_value` <- as.numeric(gsub(",","",dataset2$`032018_value`))
dataset2$`022018_quantity` <- as.numeric(gsub(",","",dataset2$`022018_quantity`))
dataset2$`022018_value` <- as.numeric(gsub(",","",dataset2$`022018_value`))
dataset2$`032017_quantity` <- as.numeric(gsub(",","",dataset2$`032017_quantity`))
dataset2$`032017_value` <- as.numeric(gsub(",","",dataset2$`032017_value`))
dataset2$`022017_quantity` <- as.numeric(gsub(",","",dataset2$`022017_quantity`))
dataset2$`022017_value` <- as.numeric(gsub(",","",dataset2$`022017_value`))
dataset2$`03122018_quantity` <- as.numeric(gsub(",","",dataset2$`03122018_quantity`))
dataset2$`03122018_value` <- as.numeric(gsub(",","",dataset2$`03122018_value`))
dataset2$`03122017_quantity` <- as.numeric(gsub(",","",dataset2$`03122017_quantity`))
dataset2$`03122017_value` <- as.numeric(gsub(",","",dataset2$`03122017_value`))
#head(dataset2)
#https://stackoverflow.com/questions/45576805/how-to-replace-all-na-in-a-dataframe-using-tidyrreplace-na
tempList <- setNames(lapply(vector("list", ncol(dataset2)), function(x) x <- 0), names(dataset2))
# Now use that list in tidyr::replace_na
dataset2 <- dataset2 %>% replace_na(tempList)
head(dataset2, 5)%>% kable() %>% kable_styling()
|
Countries
|
032018_quantity
|
032018_value
|
022018_quantity
|
022018_value
|
032017_quantity
|
032017_value
|
022017_quantity
|
022017_value
|
03122018_quantity
|
03122018_value
|
03122017_quantity
|
03122017_value
|
|
North America
|
11284
|
35667
|
8652
|
29088
|
9599
|
29928
|
10007
|
28129
|
29862
|
97002
|
28825
|
86937
|
|
Canada
|
4265
|
22062
|
4018
|
19742
|
3560
|
18850
|
2795
|
15745
|
12541
|
62596
|
8993
|
51388
|
|
Mexico
|
7019
|
13604
|
4634
|
9346
|
6039
|
11078
|
7212
|
12383
|
17321
|
34406
|
19832
|
35549
|
|
Europe
|
47170
|
183656
|
40246
|
156209
|
41862
|
164523
|
33255
|
124861
|
129763
|
497867
|
110278
|
416014
|
|
European Union
|
45085
|
177054
|
38807
|
150295
|
39659
|
156743
|
30980
|
117190
|
122900
|
477364
|
104035
|
391818
|
Data Analysis
dataset2_final = gather(dataset2,month_year_type, Number,2:13, factor_key = TRUE) %>% separate(month_year_type, c('month_year','quant_value'),sep='_') %>%
spread(month_year,'Number', convert=TRUE)
head(dataset2_final, 5)%>% kable() %>% kable_styling()
|
Countries
|
quant_value
|
022017
|
022018
|
03122017
|
03122018
|
032017
|
032018
|
|
Argentina
|
quantity
|
0
|
3
|
0
|
3
|
0
|
0
|
|
Argentina
|
value
|
0
|
14
|
0
|
14
|
0
|
0
|
|
Australia
|
quantity
|
15
|
56
|
52
|
97
|
0
|
20
|
|
Australia
|
value
|
149
|
576
|
556
|
1008
|
6
|
206
|
|
Austria
|
quantity
|
600
|
1404
|
2261
|
4438
|
1095
|
1630
|
dataset2_final$`032018` <- as.numeric(gsub(",","",dataset2_final$`032018`))
dataset2_final$`022017` <- as.numeric(gsub(",","",dataset2_final$`022017`))
dataset2_final$`022018` <- as.numeric(gsub(",","",dataset2_final$`022018`))
dataset2_final$`03122017` <- as.numeric(gsub(",","",dataset2_final$`03122017`))
dataset2_final$`03122018` <- as.numeric(gsub(",","",dataset2_final$`03122018`))
dataset2_final$`032017` <- as.numeric(gsub(",","",dataset2_final$`032017`))
final_quantity <- dataset2_final %>% filter(quant_value =='quantity') %>% rowwise() %>% mutate(totalquantity = sum(`032018`,`03122018`,`032017`, `03122017`, na.rm=TRUE)) %>% arrange(desc(totalquantity))
head(final_quantity, 5)%>% kable() %>% kable_styling()
|
Countries
|
quant_value
|
022017
|
022018
|
03122017
|
03122018
|
032017
|
032018
|
totalquantity
|
|
Europe
|
quantity
|
33255
|
40246
|
110278
|
129763
|
41862
|
47170
|
329073
|
|
European Union
|
quantity
|
30980
|
38807
|
104035
|
122900
|
39659
|
45085
|
311679
|
|
Euro Area
|
quantity
|
22224
|
31129
|
78507
|
94874
|
29769
|
34848
|
237998
|
|
Pacific Rim Countries
|
quantity
|
22381
|
45401
|
71704
|
102824
|
25457
|
28455
|
228440
|
|
Other Countries
|
quantity
|
10822
|
9097
|
33141
|
30908
|
12567
|
11108
|
87724
|
What is the distribution of Countries and Quantities?
ggplot(final_quantity,
aes(x=final_quantity$totalquantity,
y=final_quantity$Countries, fill = final_quantity$totalquantity
))+
geom_point()

What is the distribution of Countries and Values?
final_value <- dataset2_final %>% filter(quant_value =='value') %>% rowwise() %>% mutate(totalvalue = sum(`032018`,`03122018`,`032017`, `03122017`, na.rm=TRUE)) %>% arrange(desc(totalvalue))
head(final_value, 5)%>% kable() %>% kable_styling()
|
Countries
|
quant_value
|
022017
|
022018
|
03122017
|
03122018
|
032017
|
032018
|
totalvalue
|
|
Europe
|
value
|
124861
|
156209
|
416014
|
497867
|
164523
|
183656
|
1262060
|
|
European Union
|
value
|
117190
|
150295
|
391818
|
477364
|
156743
|
177054
|
1202979
|
|
Euro Area
|
value
|
88961
|
120555
|
301994
|
370970
|
117668
|
137157
|
927789
|
|
Pacific Rim Countries
|
value
|
81213
|
137556
|
262328
|
350504
|
95804
|
108106
|
816742
|
|
Germany
|
value
|
36296
|
39553
|
117740
|
126494
|
45631
|
45670
|
335535
|
ggplot(final_value,
aes(x=final_value$totalvalue,
y=final_value$Countries, fill = final_value$totalvalue
))+
geom_point()

What is the distribution ratio of Countries?
final_ratio <- final_quantity$totalquantity/final_value$totalvalue
#For debugging final_ratio
ggplot(final_value,
aes(x=final_ratio,
y=final_value$Countries
))+
geom_point()
## Warning: Removed 2 rows containing missing values (geom_point).

Retrieve Data File for Dataset 3.
dataset3 <- read.csv(file="https://raw.githubusercontent.com/AjayArora35/Data-607-Project-2/master/gapminder_wide (1).csv", header=TRUE, stringsAsFactors = FALSE)
head(dataset3,5)
## continent country gdpPercap_1952 gdpPercap_1957 gdpPercap_1962
## 1 Africa Algeria 2449.0082 3013.9760 2550.8169
## 2 Africa Angola 3520.6103 3827.9405 4269.2767
## 3 Africa Benin 1062.7522 959.6011 949.4991
## 4 Africa Botswana 851.2411 918.2325 983.6540
## 5 Africa Burkina Faso 543.2552 617.1835 722.5120
## gdpPercap_1967 gdpPercap_1972 gdpPercap_1977 gdpPercap_1982
## 1 3246.9918 4182.664 4910.417 5745.1602
## 2 5522.7764 5473.288 3008.647 2756.9537
## 3 1035.8314 1085.797 1029.161 1277.8976
## 4 1214.7093 2263.611 3214.858 4551.1421
## 5 794.8266 854.736 743.387 807.1986
## gdpPercap_1987 gdpPercap_1992 gdpPercap_1997 gdpPercap_2002
## 1 5681.3585 5023.2166 4797.295 5288.040
## 2 2430.2083 2627.8457 2277.141 2773.287
## 3 1225.8560 1191.2077 1232.975 1372.878
## 4 6205.8839 7954.1116 8647.142 11003.605
## 5 912.0631 931.7528 946.295 1037.645
## gdpPercap_2007 lifeExp_1952 lifeExp_1957 lifeExp_1962 lifeExp_1967
## 1 6223.367 43.077 45.685 48.303 51.407
## 2 4797.231 30.015 31.999 34.000 35.985
## 3 1441.285 38.223 40.358 42.618 44.885
## 4 12569.852 47.622 49.618 51.520 53.298
## 5 1217.033 31.975 34.906 37.814 40.697
## lifeExp_1972 lifeExp_1977 lifeExp_1982 lifeExp_1987 lifeExp_1992
## 1 54.518 58.014 61.368 65.799 67.744
## 2 37.928 39.483 39.942 39.906 40.647
## 3 47.014 49.190 50.904 52.337 53.919
## 4 56.024 59.319 61.484 63.622 62.745
## 5 43.591 46.137 48.122 49.557 50.260
## lifeExp_1997 lifeExp_2002 lifeExp_2007 pop_1952 pop_1957 pop_1962
## 1 69.152 70.994 72.301 9279525 10270856 11000948
## 2 40.963 41.003 42.731 4232095 4561361 4826015
## 3 54.777 54.406 56.728 1738315 1925173 2151895
## 4 52.556 46.634 50.728 442308 474639 512764
## 5 50.324 50.650 52.295 4469979 4713416 4919632
## pop_1967 pop_1972 pop_1977 pop_1982 pop_1987 pop_1992 pop_1997 pop_2002
## 1 12760499 14760787 17152804 20033753 23254956 26298373 29072015 31287142
## 2 5247469 5894858 6162675 7016384 7874230 8735988 9875024 10866106
## 3 2427334 2761407 3168267 3641603 4243788 4981671 6066080 7026113
## 4 553541 619351 781472 970347 1151184 1342614 1536536 1630347
## 5 5127935 5433886 5889574 6634596 7586551 8878303 10352843 12251209
## pop_2007
## 1 33333216
## 2 12420476
## 3 8078314
## 4 1639131
## 5 14326203
Data Analysis
dataset3_long <- dataset3 %>%
gather(key = obstype_year,
value = values,
-continent, -country) %>%
separate(obstype_year,
into = c('type','year'),
sep = "_",
convert = TRUE) #this ensures that the year column is an integer rather than a character
head(dataset3_long,5)
## continent country type year values
## 1 Africa Algeria gdpPercap 1952 2449.0082
## 2 Africa Angola gdpPercap 1952 3520.6103
## 3 Africa Benin gdpPercap 1952 1062.7522
## 4 Africa Botswana gdpPercap 1952 851.2411
## 5 Africa Burkina Faso gdpPercap 1952 543.2552
What is distribution of GDP per/cap per country?
ggplot(dataset3_long,
aes(x=dataset3_long$values,
y=dataset3_long$country
))+
geom_point()

What is the distribution of GDP per/year per continent?
bycontinents <- dataset3_long %>% group_by(continent, year) %>% summarize(bycontinents_mean = mean(values))
head(bycontinents, 5)
## # A tibble: 5 x 3
## # Groups: continent [1]
## continent year bycontinents_mean
## <chr> <int> <dbl>
## 1 Africa 1952 1523767.
## 2 Africa 1957 1698153.
## 3 Africa 1962 1901296.
## 4 Africa 1967 2149990.
## 5 Africa 1972 2435921.
ggplot(bycontinents,
aes(x=bycontinents$year,
y=bycontinents$bycontinents_mean, fill = bycontinents$continent
))+
geom_bar(stat="identity", position="dodge")

What is the distribution of Life Expectancy per year/per continent?
bylifeexp <- dataset3_long %>% filter(type == "lifeExp") %>% group_by(continent, year) %>% summarize(bylifeexp_mean = mean(values))
head(bylifeexp,5)
## # A tibble: 5 x 3
## # Groups: continent [1]
## continent year bylifeexp_mean
## <chr> <int> <dbl>
## 1 Africa 1952 39.1
## 2 Africa 1957 41.3
## 3 Africa 1962 43.3
## 4 Africa 1967 45.3
## 5 Africa 1972 47.5
What is distribution of GDP per/cap per country?
ggplot(bylifeexp,
aes(x=bylifeexp$year,
y=bylifeexp$bylifeexp_mean, fill = bylifeexp$continent
))+
geom_bar(stat="identity", position="dodge")

What is the distribution of Population per year/per continent?
bypop <- dataset3_long %>% filter(type == "pop") %>% group_by(continent, year) %>% summarize(bypop_mean = mean(values))
head(bypop,5)
## # A tibble: 5 x 3
## # Groups: continent [1]
## continent year bypop_mean
## <chr> <int> <dbl>
## 1 Africa 1952 4570010.
## 2 Africa 1957 5093033.
## 3 Africa 1962 5702247.
## 4 Africa 1967 6447875.
## 5 Africa 1972 7305376.
What is distribution of GDP per/cap per country?
ggplot(bypop,
aes(x=bypop$year,
y=bypop$bypop_mean, fill = bypop$continent
))+
geom_bar(stat="identity", position="dodge")
