## Exercise 4.1
library(readr)
# Load the Winter Olympics medal table into a tibble.
# NOTE(review): the absolute path is machine-specific; adjust it (or switch to
# a project-relative path) before running this script elsewhere.
winter_olympic <- read_csv("C:/Users/ywang/RStudio/R/winter_olympic.csv")
## Rows: 26 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): NOC, Region
## dbl (5): Rank, Gold, Silver, Bronze, Total
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# View() opens a spreadsheet-style viewer and is only meaningful in an
# interactive session, so skip it when the script is run in batch or knitted.
if (interactive()) {
  View(winter_olympic)
}
# What are the names of the columns?
colnames(winter_olympic)
## [1] "Rank" "NOC" "Gold" "Silver" "Bronze" "Total" "Region"
# How many countries (rows) are in the data frame? Rows first, then columns.
c(nrow(winter_olympic), ncol(winter_olympic))
## [1] 26 7
# Sort by total medals, breaking ties alphabetically by country (NOC).
# The columns are referenced explicitly instead of attach()-ing the data
# frame: attach() places copies of the columns on the search path, where they
# can mask (or be masked by) other objects and go stale if the data changes.
sort_total <- winter_olympic[
  order(winter_olympic$Total, winter_olympic$NOC),
]
sort_total
## # A tibble: 26 × 7
## Rank NOC Gold Silver Bronze Total Region
## <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 25 Croatia (CRO) 0 1 0 1 EUROPE
## 2 26 Kazakhstan (KAZ) 0 0 1 1 EURASIA
## 3 21 Slovakia (SVK) 1 0 0 1 EUROPE
## 4 20 Ukraine (UKR) 1 0 1 2 EURASIA
## 5 24 Australia (AUS) 0 2 1 3 AUSTRALIA
## 6 19 Great Britain (GBR) 1 1 2 4 EUROPE
## 7 23 Latvia (LAT) 0 2 2 4 EURASIA
## 8 18 Finland (FIN) 1 3 1 5 EUROPE
## 9 8 Belarus (BLR) 5 0 1 6 EURASIA
## 10 11 Poland (POL) 4 1 1 6 EUROPE
## # … with 16 more rows
#get summary statistics on data
library(Hmisc)
## 载入需要的程辑包:lattice
## 载入需要的程辑包:survival
## 载入需要的程辑包:Formula
## 载入需要的程辑包:ggplot2
##
## 载入程辑包:'Hmisc'
## The following objects are masked from 'package:base':
##
## format.pval, units
# Per-column summary (n, missing, distinct, quantiles, frequency tables).
# The call is namespace-qualified because psych, attached later in this
# script, also exports describe() and masks the Hmisc version; without the
# prefix a re-run in the same session would silently call psych::describe().
Hmisc::describe(winter_olympic)
## winter_olympic
##
## 7 Variables 26 Observations
## --------------------------------------------------------------------------------
## Rank
## n missing distinct Info Mean Gmd .05 .10
## 26 0 26 1 13.5 9 2.25 3.50
## .25 .50 .75 .90 .95
## 7.25 13.50 19.75 23.50 24.75
##
## lowest : 1 2 3 4 5, highest: 22 23 24 25 26
## --------------------------------------------------------------------------------
## NOC
## n missing distinct
## 26 0 26
##
## lowest : Australia (AUS) Austria (AUT) Belarus (BLR) Canada (CAN) China (CHN)
## highest: South Korea (KOR) Sweden (SWE) Switzerland (SUI) Ukraine (UKR) United States (USA)
## --------------------------------------------------------------------------------
## Gold
## n missing distinct Info Mean Gmd .05 .10
## 26 0 12 0.983 3.808 4.218 0.00 0.00
## .25 .50 .75 .90 .95
## 1.00 2.50 5.75 9.50 10.75
##
## lowest : 0 1 2 3 4, highest: 8 9 10 11 13
##
## Value 0 1 2 3 4 5 6 8 9 10 11
## Frequency 5 5 3 2 3 1 1 2 1 1 1
## Proportion 0.192 0.192 0.115 0.077 0.115 0.038 0.038 0.077 0.038 0.038 0.038
##
## Value 13
## Frequency 1
## Proportion 0.038
## --------------------------------------------------------------------------------
## Silver
## n missing distinct Info Mean Gmd .05 .10
## 26 0 11 0.986 3.731 3.511 0.00 0.00
## .25 .50 .75 .90 .95
## 1.25 3.00 5.75 7.50 9.50
##
## lowest : 0 1 2 3 4, highest: 6 7 8 10 11
##
## Value 0 1 2 3 4 5 6 7 8 10 11
## Frequency 4 3 4 3 4 1 1 3 1 1 1
## Proportion 0.154 0.115 0.154 0.115 0.154 0.038 0.038 0.115 0.038 0.038 0.038
## --------------------------------------------------------------------------------
## Bronze
## n missing distinct Info Mean Gmd .05 .10
## 26 0 11 0.974 3.808 3.683 0.25 1.00
## .25 .50 .75 .90 .95
## 1.00 2.00 5.75 9.00 9.75
##
## lowest : 0 1 2 3 4, highest: 6 7 9 10 12
##
## Value 0 1 2 3 4 5 6 7 9 10 12
## Frequency 2 6 6 1 1 3 2 1 2 1 1
## Proportion 0.077 0.231 0.231 0.038 0.038 0.115 0.077 0.038 0.077 0.038 0.038
## --------------------------------------------------------------------------------
## Total
## n missing distinct Info Mean Gmd .05 .10
## 26 0 17 0.991 11.35 10.35 1.00 1.50
## .25 .50 .75 .90 .95
## 4.25 8.00 16.50 25.50 27.50
##
## lowest : 1 2 3 4 5, highest: 24 25 26 28 33
##
## Value 1 2 3 4 5 6 8 9 11 15 17
## Frequency 3 1 1 2 1 2 5 1 1 2 1
## Proportion 0.115 0.038 0.038 0.077 0.038 0.077 0.192 0.038 0.038 0.077 0.038
##
## Value 19 24 25 26 28 33
## Frequency 1 1 1 1 1 1
## Proportion 0.038 0.038 0.038 0.038 0.038 0.038
## --------------------------------------------------------------------------------
## Region
## n missing distinct
## 26 0 5
##
## lowest : ASIA AUSTRALIA EURASIA EUROPE NORTH_A
## highest: ASIA AUSTRALIA EURASIA EUROPE NORTH_A
##
## Value ASIA AUSTRALIA EURASIA EUROPE NORTH_A
## Frequency 3 1 5 15 2
## Proportion 0.115 0.038 0.192 0.577 0.077
## --------------------------------------------------------------------------------
# What are the median, mean and total number of gold, silver, bronze and total
# medals? Measures of spread (IQR, min, max, variance, sd) are reported too.
# The median() and sum() calls answer the "median" and "total number" parts of
# the question, which the other statistics do not cover.

# Gold
median(winter_olympic$Gold)
## [1] 2.5
IQR(winter_olympic$Gold)
## [1] 4.75
min(winter_olympic$Gold)
## [1] 0
max(winter_olympic$Gold)
## [1] 13
mean(winter_olympic$Gold)
## [1] 3.807692
sum(winter_olympic$Gold)
## [1] 99
var(winter_olympic$Gold)
## [1] 14.64154
sd(winter_olympic$Gold)
## [1] 3.826426

# Silver
median(winter_olympic$Silver)
## [1] 3
IQR(winter_olympic$Silver)
## [1] 4.5
min(winter_olympic$Silver)
## [1] 0
max(winter_olympic$Silver)
## [1] 11
mean(winter_olympic$Silver)
## [1] 3.730769
sum(winter_olympic$Silver)
## [1] 97
var(winter_olympic$Silver)
## [1] 9.644615
sd(winter_olympic$Silver)
## [1] 3.105578

# Bronze
median(winter_olympic$Bronze)
## [1] 2
IQR(winter_olympic$Bronze)
## [1] 4.75
min(winter_olympic$Bronze)
## [1] 0
max(winter_olympic$Bronze)
## [1] 12
mean(winter_olympic$Bronze)
## [1] 3.807692
sum(winter_olympic$Bronze)
## [1] 99
var(winter_olympic$Bronze)
## [1] 11.20154
sd(winter_olympic$Bronze)
## [1] 3.34687

# Total
median(winter_olympic$Total)
## [1] 8
IQR(winter_olympic$Total)
## [1] 12.25
min(winter_olympic$Total)
## [1] 1
max(winter_olympic$Total)
## [1] 33
mean(winter_olympic$Total)
## [1] 11.34615
sum(winter_olympic$Total)
## [1] 295
var(winter_olympic$Total)
## [1] 86.79538
sd(winter_olympic$Total)
## [1] 9.316404
library(psych)
##
## 载入程辑包:'psych'
## The following object is masked from 'package:Hmisc':
##
## describe
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
# Skewness of the gold-medal counts.
skew(winter_olympic[["Gold"]])
## [1] 0.87898
# Mean total medals per country; the mean is already a single value, so no
# further reduction is needed.
mean(winter_olympic[["Total"]])
## [1] 11.34615
# Permutation that places rows with the same Region next to each other; the
# "ends" attribute marks where each group stops and "maxgrpn" is the size of
# the largest group.
grouping(winter_olympic[["Region"]])
## [1] 1 8 20 23 26 2 5 6 7 9 10 11 14 15 16 18 19 21 22 25 3 4 12 13 17
## [26] 24
## attr(,"ends")
## [1] 5 20 22 25 26
## attr(,"maxgrpn")
## [1] 15
## attr(,"class")
## [1] "grouping" "integer"
# How many countries are in each geographic region? The `n` column of the
# per-group summary gives the count; the rest describes Total within a region.
describeBy(
  x = winter_olympic[["Total"]],
  group = winter_olympic[["Region"]]
)
##
## Descriptive statistics by group
## group: ASIA
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 3 8.33 0.58 8 8.33 0 8 9 1 0.38 -2.33 0.33
## ------------------------------------------------------------
## group: AUSTRALIA
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 1 3 NA 3 3 0 3 3 0 NA NA NA
## ------------------------------------------------------------
## group: EURASIA
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 5 9.2 13.44 4 9.2 2.97 1 33 32 1.02 -0.99 6.01
## ------------------------------------------------------------
## group: EUROPE
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 15 11.2 7.85 8 10.85 10.38 1 26 25 0.43 -1.12 2.03
## ------------------------------------------------------------
## group: NORTH_A
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 2 26.5 2.12 26.5 26.5 2.22 25 28 3 0 -2.75 1.5
# Countries in the North America region. The region code is all upper case
# ("NORTH_A"); string comparison is case-sensitive, so the previous
# "NoRTH_A" literal matched nothing and produced an empty tibble.
north_america <- subset(winter_olympic, Region == "NORTH_A")
north_america
## Expected: a 2 × 7 tibble (the NORTH_A rows with Total 25 and 28, matching
## n = 2 for NORTH_A in the describeBy() output); re-run to record the rows.
# Countries in the Europe region (%in% never yields NA, so rows with a missing
# Region are excluded just as subset() would exclude them).
europe <- winter_olympic[winter_olympic$Region %in% "EUROPE", ]
europe
## # A tibble: 15 × 7
## Rank NOC Gold Silver Bronze Total Region
## <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 2 Norway (NOR) 11 5 10 26 EUROPE
## 2 5 Netherlands (NED) 8 7 9 24 EUROPE
## 3 6 Germany (GER) 8 6 5 19 EUROPE
## 4 7 Switzerland (SUI) 6 3 2 11 EUROPE
## 5 9 Austria (AUT) 4 8 5 17 EUROPE
## 6 10 France (FRA) 4 4 7 15 EUROPE
## 7 11 Poland (POL) 4 1 1 6 EUROPE
## 8 14 Sweden (SWE) 2 7 6 15 EUROPE
## 9 15 Czech Republic (CZE) 2 4 2 8 EUROPE
## 10 16 Slovenia (SLO) 2 2 4 8 EUROPE
## 11 18 Finland (FIN) 1 3 1 5 EUROPE
## 12 19 Great Britain (GBR) 1 1 2 4 EUROPE
## 13 21 Slovakia (SVK) 1 0 0 1 EUROPE
## 14 22 Italy (ITA) 0 2 6 8 EUROPE
## 15 25 Croatia (CRO) 0 1 0 1 EUROPE
# Countries in the Australia region. NOTE(review): the object name is spelled
# "austrilia"; it is kept as-is in case other code refers to it.
austrilia <- winter_olympic[winter_olympic$Region %in% "AUSTRALIA", ]
austrilia
## # A tibble: 1 × 7
## Rank NOC Gold Silver Bronze Total Region
## <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 24 Australia (AUS) 0 2 1 3 AUSTRALIA
# Countries in the Eurasia region.
eurasia <- winter_olympic[winter_olympic$Region %in% "EURASIA", ]
eurasia
## # A tibble: 5 × 7
## Rank NOC Gold Silver Bronze Total Region
## <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 1 Russia (RUS)* 13 11 9 33 EURASIA
## 2 8 Belarus (BLR) 5 0 1 6 EURASIA
## 3 20 Ukraine (UKR) 1 0 1 2 EURASIA
## 4 23 Latvia (LAT) 0 2 2 4 EURASIA
## 5 26 Kazakhstan (KAZ) 0 0 1 1 EURASIA
# Countries in the Asia region.
asia <- winter_olympic[winter_olympic$Region %in% "ASIA", ]
asia
## # A tibble: 3 × 7
## Rank NOC Gold Silver Bronze Total Region
## <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 12 China (CHN) 3 4 2 9 ASIA
## 2 13 South Korea (KOR) 3 3 2 8 ASIA
## 3 17 Japan (JPN) 1 4 3 8 ASIA
# What is the max number of medals won? Which country won the max?
max(winter_olympic$Total)
## [1] 33
# Filter on the computed maximum instead of a hard-coded 33 so the lookup
# stays correct if the data changes; ties would return every top country.
subset(winter_olympic, Total == max(Total))
## # A tibble: 1 × 7
## Rank NOC Gold Silver Bronze Total Region
## <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 1 Russia (RUS)* 13 11 9 33 EURASIA
# Cross-tabulate total medal counts (rows) against region (columns).
table(winter_olympic[["Total"]], winter_olympic[["Region"]])
##
## ASIA AUSTRALIA EURASIA EUROPE NORTH_A
## 1 0 0 1 2 0
## 2 0 0 1 0 0
## 3 0 1 0 0 0
## 4 0 0 1 1 0
## 5 0 0 0 1 0
## 6 0 0 1 1 0
## 8 2 0 0 3 0
## 9 1 0 0 0 0
## 11 0 0 0 1 0
## 15 0 0 0 2 0
## 17 0 0 0 1 0
## 19 0 0 0 1 0
## 24 0 0 0 1 0
## 25 0 0 0 0 1
## 26 0 0 0 1 0
## 28 0 0 0 0 1
## 33 0 0 1 0 0
# Explore correlations between total medals and the number of gold, silver
# and bronze medals.
cor(winter_olympic$Total, winter_olympic$Gold)
## [1] 0.9186698
cor(winter_olympic$Total, winter_olympic$Silver)
## [1] 0.8992183
# The question above asks about bronze, which was not computed before.
# Output not recorded here; run the line to obtain the value.
cor(winter_olympic$Total, winter_olympic$Bronze)
cor(winter_olympic$Gold, winter_olympic$Silver)
## [1] 0.7427381
# What is the max number of medals won? Which country won the max?
# NOTE(review): this repeats a lookup made earlier in the script.
max(winter_olympic$Total)
## [1] 33
# Filter on the computed maximum instead of a hard-coded 33 so the lookup
# stays correct if the data changes; ties would return every top country.
subset(winter_olympic, Total == max(Total))
## # A tibble: 1 × 7
## Rank NOC Gold Silver Bronze Total Region
## <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 1 Russia (RUS)* 13 11 9 33 EURASIA
# Cross-tabulate total medal counts (rows) against region (columns).
table(winter_olympic[["Total"]], winter_olympic[["Region"]])
##
## ASIA AUSTRALIA EURASIA EUROPE NORTH_A
## 1 0 0 1 2 0
## 2 0 0 1 0 0
## 3 0 1 0 0 0
## 4 0 0 1 1 0
## 5 0 0 0 1 0
## 6 0 0 1 1 0
## 8 2 0 0 3 0
## 9 1 0 0 0 0
## 11 0 0 0 1 0
## 15 0 0 0 2 0
## 17 0 0 0 1 0
## 19 0 0 0 1 0
## 24 0 0 0 1 0
## 25 0 0 0 0 1
## 26 0 0 0 1 0
## 28 0 0 0 0 1
## 33 0 0 1 0 0
# What is the correlation between rank and total medals? Is this expected or
# surprising?
cor(x = winter_olympic[["Rank"]], y = winter_olympic[["Total"]])
## [1] -0.874864
## Exercise 4.2
library(readr)
# Load the GDP table: one row per country, one column per year (1960 onward,
# per the column specification printed below).
# NOTE(review): the absolute path is machine-specific; adjust it (or switch to
# a project-relative path) before running this script elsewhere.
gdp <- read_csv("C:/Users/ywang/RStudio/R/gdp.csv")
## Rows: 264 Columns: 60
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Country Name, Country Code
## dbl (58): 1960, 1961, 1962, 1963, 1964, 1965, 1966, 1967, 1968, 1969, 1970, ...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# View() opens a spreadsheet-style viewer and is only meaningful in an
# interactive session, so skip it when the script is run in batch or knitted.
if (interactive()) {
  View(gdp)
}
# Rescale the 2017 column by 1e9 in place (presumably converting raw currency
# units to billions; confirm against the data source). Running this line more
# than once rescales the column again.
gdp[["2017"]] <- gdp[["2017"]] / 1e9
# Centre of the 2017 values, ignoring countries with no figure.
mean(gdp[["2017"]], na.rm = TRUE)
## [1] 2848.129
median(gdp[["2017"]], na.rm = TRUE)
## [1] 59.44777
# Smallest and largest 2017 values.
range(gdp[["2017"]], na.rm = TRUE)
## [1] 3.973132e-02 8.068379e+04
# Quartiles of the 2017 values.
quantile(gdp[["2017"]], na.rm = TRUE)
## 0% 25% 50% 75% 100%
## 3.973132e-02 1.150900e+01 5.944777e+01 7.175373e+02 8.068379e+04