##Exercise 4.1

library(readr)
# Load the medal table (26 rows, 7 columns per the readr message below).
# NOTE(review): the path is absolute and specific to one machine; confirm
# whether a project-relative path should be used instead.
winter_olympic <- read_csv("C:/Users/ywang/RStudio/R/winter_olympic.csv")
## Rows: 26 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): NOC, Region
## dbl (5): Rank, Gold, Silver, Bronze, Total
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# View() opens a GUI data viewer, so only call it from an interactive session;
# this keeps the script runnable when knitted or sourced in batch mode.
if (interactive()) {
  View(winter_olympic)
}

# What are the names of the columns?

colnames(winter_olympic)
## [1] "Rank"   "NOC"    "Gold"   "Silver" "Bronze" "Total"  "Region"

# How many countries (rows) are in the data frame?
# The first value printed is the row count, the second the column count.

c(nrow(winter_olympic), ncol(winter_olympic))
## [1] 26  7

# Sort by total medals (ascending), breaking ties by country name (NOC).
# with() resolves Total and NOC inside the data frame for this one expression,
# so nothing is attached to the search path and no later code can pick up a
# stale copy of a column by accident.

sort_total <- winter_olympic[with(winter_olympic, order(Total, NOC)), ]
sort_total
## # A tibble: 26 × 7
##     Rank NOC                   Gold Silver Bronze Total Region   
##    <dbl> <chr>                <dbl>  <dbl>  <dbl> <dbl> <chr>    
##  1    25  Croatia (CRO)           0      1      0     1 EUROPE   
##  2    26  Kazakhstan (KAZ)        0      0      1     1 EURASIA  
##  3    21  Slovakia (SVK)          1      0      0     1 EUROPE   
##  4    20  Ukraine (UKR)           1      0      1     2 EURASIA  
##  5    24  Australia (AUS)         0      2      1     3 AUSTRALIA
##  6    19  Great Britain (GBR)     1      1      2     4 EUROPE   
##  7    23  Latvia (LAT)            0      2      2     4 EURASIA  
##  8    18  Finland (FIN)           1      3      1     5 EUROPE   
##  9     8  Belarus (BLR)           5      0      1     6 EURASIA  
## 10    11  Poland (POL)            4      1      1     6 EUROPE   
## # … with 16 more rows

#get summary statistics on data

library(Hmisc)
## 载入需要的程辑包:lattice
## 载入需要的程辑包:survival
## 载入需要的程辑包:Formula
## 载入需要的程辑包:ggplot2
## 
## 载入程辑包:'Hmisc'
## The following objects are masked from 'package:base':
## 
##     format.pval, units
# Call Hmisc's describe() by namespace: psych, which is loaded further down in
# this script, also exports describe() and masks this one, so an unqualified
# call would give different output if the script is re-run in the same session.
Hmisc::describe(winter_olympic)
## winter_olympic 
## 
##  7  Variables      26  Observations
## --------------------------------------------------------------------------------
## Rank 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##       26        0       26        1     13.5        9     2.25     3.50 
##      .25      .50      .75      .90      .95 
##     7.25    13.50    19.75    23.50    24.75 
## 
## lowest :  1  2  3  4  5, highest: 22 23 24 25 26
## --------------------------------------------------------------------------------
## NOC 
##        n  missing distinct 
##       26        0       26 
## 
## lowest :  Australia (AUS)      Austria (AUT)        Belarus (BLR)        Canada (CAN)         China (CHN)        
## highest:  South Korea (KOR)    Sweden (SWE)         Switzerland (SUI)    Ukraine (UKR)        United States (USA)
## --------------------------------------------------------------------------------
## Gold 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##       26        0       12    0.983    3.808    4.218     0.00     0.00 
##      .25      .50      .75      .90      .95 
##     1.00     2.50     5.75     9.50    10.75 
## 
## lowest :  0  1  2  3  4, highest:  8  9 10 11 13
##                                                                             
## Value          0     1     2     3     4     5     6     8     9    10    11
## Frequency      5     5     3     2     3     1     1     2     1     1     1
## Proportion 0.192 0.192 0.115 0.077 0.115 0.038 0.038 0.077 0.038 0.038 0.038
##                 
## Value         13
## Frequency      1
## Proportion 0.038
## --------------------------------------------------------------------------------
## Silver 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##       26        0       11    0.986    3.731    3.511     0.00     0.00 
##      .25      .50      .75      .90      .95 
##     1.25     3.00     5.75     7.50     9.50 
## 
## lowest :  0  1  2  3  4, highest:  6  7  8 10 11
##                                                                             
## Value          0     1     2     3     4     5     6     7     8    10    11
## Frequency      4     3     4     3     4     1     1     3     1     1     1
## Proportion 0.154 0.115 0.154 0.115 0.154 0.038 0.038 0.115 0.038 0.038 0.038
## --------------------------------------------------------------------------------
## Bronze 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##       26        0       11    0.974    3.808    3.683     0.25     1.00 
##      .25      .50      .75      .90      .95 
##     1.00     2.00     5.75     9.00     9.75 
## 
## lowest :  0  1  2  3  4, highest:  6  7  9 10 12
##                                                                             
## Value          0     1     2     3     4     5     6     7     9    10    12
## Frequency      2     6     6     1     1     3     2     1     2     1     1
## Proportion 0.077 0.231 0.231 0.038 0.038 0.115 0.077 0.038 0.077 0.038 0.038
## --------------------------------------------------------------------------------
## Total 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##       26        0       17    0.991    11.35    10.35     1.00     1.50 
##      .25      .50      .75      .90      .95 
##     4.25     8.00    16.50    25.50    27.50 
## 
## lowest :  1  2  3  4  5, highest: 24 25 26 28 33
##                                                                             
## Value          1     2     3     4     5     6     8     9    11    15    17
## Frequency      3     1     1     2     1     2     5     1     1     2     1
## Proportion 0.115 0.038 0.038 0.077 0.038 0.077 0.192 0.038 0.038 0.077 0.038
##                                               
## Value         19    24    25    26    28    33
## Frequency      1     1     1     1     1     1
## Proportion 0.038 0.038 0.038 0.038 0.038 0.038
## --------------------------------------------------------------------------------
## Region 
##        n  missing distinct 
##       26        0        5 
## 
## lowest : ASIA      AUSTRALIA EURASIA   EUROPE    NORTH_A  
## highest: ASIA      AUSTRALIA EURASIA   EUROPE    NORTH_A  
##                                                             
## Value           ASIA AUSTRALIA   EURASIA    EUROPE   NORTH_A
## Frequency          3         1         5        15         2
## Proportion     0.115     0.038     0.192     0.577     0.077
## --------------------------------------------------------------------------------

# What are the median, the mean and the total number of Gold, Silver, Bronze
# and Total medals? Spread statistics (IQR, min, max, variance, sd) follow.

# median() and sum() answer the question directly; they were missing before.
# vapply() returns one named number per medal column.
# (Output for these two calls has not been recorded in this transcript yet.)
medal_cols <- c("Gold", "Silver", "Bronze", "Total")
vapply(winter_olympic[medal_cols], median, numeric(1))
vapply(winter_olympic[medal_cols], sum, numeric(1))

# Gold
IQR(winter_olympic$Gold)
## [1] 4.75
min(winter_olympic$Gold)
## [1] 0
max(winter_olympic$Gold)
## [1] 13
mean(winter_olympic$Gold)
## [1] 3.807692
var(winter_olympic$Gold)
## [1] 14.64154
sd(winter_olympic$Gold)
## [1] 3.826426
# Silver
IQR(winter_olympic$Silver)
## [1] 4.5
min(winter_olympic$Silver)
## [1] 0
max(winter_olympic$Silver)
## [1] 11
mean(winter_olympic$Silver)
## [1] 3.730769
var(winter_olympic$Silver)
## [1] 9.644615
sd(winter_olympic$Silver)
## [1] 3.105578
# Bronze
IQR(winter_olympic$Bronze)
## [1] 4.75
min(winter_olympic$Bronze)
## [1] 0
max(winter_olympic$Bronze)
## [1] 12
mean(winter_olympic$Bronze)
## [1] 3.807692
var(winter_olympic$Bronze)
## [1] 11.20154
sd(winter_olympic$Bronze)
## [1] 3.34687
# Total
IQR(winter_olympic$Total)
## [1] 12.25
min(winter_olympic$Total)
## [1] 1
max(winter_olympic$Total)
## [1] 33
mean(winter_olympic$Total)
## [1] 11.34615
var(winter_olympic$Total)
## [1] 86.79538
sd(winter_olympic$Total)
## [1] 9.316404
library(psych)
## 
## 载入程辑包:'psych'
## The following object is masked from 'package:Hmisc':
## 
##     describe
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
# Skewness of the gold-medal counts (skew() is provided by psych, attached
# just above).
skew(winter_olympic[["Gold"]])
## [1] 0.87898
# Mean total medals per country. mean() already yields a single number, so it
# is printed directly.
mean(winter_olympic[["Total"]])
## [1] 11.34615
# Permutation that places rows with the same Region next to each other; the
# "ends" attribute gives the position where each group finishes.
grouping(winter_olympic[["Region"]])
##  [1]  1  8 20 23 26  2  5  6  7  9 10 11 14 15 16 18 19 21 22 25  3  4 12 13 17
## [26] 24
## attr(,"ends")
## [1]  5 20 22 25 26
## attr(,"maxgrpn")
## [1] 15
## attr(,"class")
## [1] "grouping" "integer"

# How many countries are in each geographic region?
# The n column of each group's summary gives the country count.

describeBy(winter_olympic[["Total"]], group = winter_olympic[["Region"]])
## 
##  Descriptive statistics by group 
## group: ASIA
##    vars n mean   sd median trimmed mad min max range skew kurtosis   se
## X1    1 3 8.33 0.58      8    8.33   0   8   9     1 0.38    -2.33 0.33
## ------------------------------------------------------------ 
## group: AUSTRALIA
##    vars n mean sd median trimmed mad min max range skew kurtosis se
## X1    1 1    3 NA      3       3   0   3   3     0   NA       NA NA
## ------------------------------------------------------------ 
## group: EURASIA
##    vars n mean    sd median trimmed  mad min max range skew kurtosis   se
## X1    1 5  9.2 13.44      4     9.2 2.97   1  33    32 1.02    -0.99 6.01
## ------------------------------------------------------------ 
## group: EUROPE
##    vars  n mean   sd median trimmed   mad min max range skew kurtosis   se
## X1    1 15 11.2 7.85      8   10.85 10.38   1  26    25 0.43    -1.12 2.03
## ------------------------------------------------------------ 
## group: NORTH_A
##    vars n mean   sd median trimmed  mad min max range skew kurtosis  se
## X1    1 2 26.5 2.12   26.5    26.5 2.22  25  28     3    0    -2.75 1.5
# Region codes are stored in upper case ("NORTH_A"). The previous literal
# "NoRTH_A" matched no rows, so north_america was an empty tibble.
north_america <- subset(winter_olympic, Region == "NORTH_A")
north_america
## (previous empty-tibble output removed; re-knit to record the NORTH_A rows --
##  the by-region summary above reports n = 2 for this group)
# One tibble per remaining region, selected by row position. which() keeps
# only the rows where the comparison is TRUE, so rows with a missing Region
# are left out.
# NOTE(review): `austrilia` is a misspelling of "australia"; the name is kept
# because other code may refer to it.
europe <- winter_olympic[which(winter_olympic$Region == "EUROPE"), ]
europe
## # A tibble: 15 × 7
##     Rank NOC                    Gold Silver Bronze Total Region
##    <dbl> <chr>                 <dbl>  <dbl>  <dbl> <dbl> <chr> 
##  1     2  Norway (NOR)            11      5     10    26 EUROPE
##  2     5  Netherlands (NED)        8      7      9    24 EUROPE
##  3     6  Germany (GER)            8      6      5    19 EUROPE
##  4     7  Switzerland (SUI)        6      3      2    11 EUROPE
##  5     9  Austria (AUT)            4      8      5    17 EUROPE
##  6    10  France (FRA)             4      4      7    15 EUROPE
##  7    11  Poland (POL)             4      1      1     6 EUROPE
##  8    14  Sweden (SWE)             2      7      6    15 EUROPE
##  9    15  Czech Republic (CZE)     2      4      2     8 EUROPE
## 10    16  Slovenia (SLO)           2      2      4     8 EUROPE
## 11    18  Finland (FIN)            1      3      1     5 EUROPE
## 12    19  Great Britain (GBR)      1      1      2     4 EUROPE
## 13    21  Slovakia (SVK)           1      0      0     1 EUROPE
## 14    22  Italy (ITA)              0      2      6     8 EUROPE
## 15    25  Croatia (CRO)            0      1      0     1 EUROPE
austrilia <- winter_olympic[which(winter_olympic$Region == "AUSTRALIA"), ]
austrilia
## # A tibble: 1 × 7
##    Rank NOC               Gold Silver Bronze Total Region   
##   <dbl> <chr>            <dbl>  <dbl>  <dbl> <dbl> <chr>    
## 1    24  Australia (AUS)     0      2      1     3 AUSTRALIA
eurasia <- winter_olympic[which(winter_olympic$Region == "EURASIA"), ]
eurasia
## # A tibble: 5 × 7
##    Rank NOC                Gold Silver Bronze Total Region 
##   <dbl> <chr>             <dbl>  <dbl>  <dbl> <dbl> <chr>  
## 1     1  Russia (RUS)*       13     11      9    33 EURASIA
## 2     8  Belarus (BLR)        5      0      1     6 EURASIA
## 3    20  Ukraine (UKR)        1      0      1     2 EURASIA
## 4    23  Latvia (LAT)         0      2      2     4 EURASIA
## 5    26  Kazakhstan (KAZ)     0      0      1     1 EURASIA
asia <- winter_olympic[which(winter_olympic$Region == "ASIA"), ]
asia
## # A tibble: 3 × 7
##    Rank NOC                 Gold Silver Bronze Total Region
##   <dbl> <chr>              <dbl>  <dbl>  <dbl> <dbl> <chr> 
## 1    12  China (CHN)           3      4      2     9 ASIA  
## 2    13  South Korea (KOR)     3      3      2     8 ASIA  
## 3    17  Japan (JPN)           1      4      3     8 ASIA

# What is the maximum number of medals won, and which country won it?

max(winter_olympic$Total)
## [1] 33
# Select the row(s) holding the maximum rather than hard-coding 33, so the
# answer stays correct if the data changes and ties are all returned.
subset(winter_olympic, Total == max(Total))
## # A tibble: 1 × 7
##    Rank NOC             Gold Silver Bronze Total Region 
##   <dbl> <chr>          <dbl>  <dbl>  <dbl> <dbl> <chr>  
## 1     1  Russia (RUS)*    13     11      9    33 EURASIA
# Cross-tabulate total medals (rows) against region (columns).
table(winter_olympic$Total, winter_olympic$Region)
##     
##      ASIA AUSTRALIA EURASIA EUROPE NORTH_A
##   1     0         0       1      2       0
##   2     0         0       1      0       0
##   3     0         1       0      0       0
##   4     0         0       1      1       0
##   5     0         0       0      1       0
##   6     0         0       1      1       0
##   8     2         0       0      3       0
##   9     1         0       0      0       0
##   11    0         0       0      1       0
##   15    0         0       0      2       0
##   17    0         0       0      1       0
##   19    0         0       0      1       0
##   24    0         0       0      1       0
##   25    0         0       0      0       1
##   26    0         0       0      1       0
##   28    0         0       0      0       1
##   33    0         0       1      0       0

# Explore correlations between total medals and the gold, silver and bronze
# counts, plus gold against silver.

cor(winter_olympic$Total, winter_olympic$Gold)
## [1] 0.9186698
cor(winter_olympic$Total, winter_olympic$Silver)
## [1] 0.8992183
cor(winter_olympic$Gold, winter_olympic$Silver)
## [1] 0.7427381
# The question names bronze, but it was never correlated with the total.
# (Output for this call has not been recorded in this transcript yet.)
cor(winter_olympic$Total, winter_olympic$Bronze)

# What is the maximum number of medals won, and which country won it?
# (This section repeats a question already asked earlier in the script.)

max(winter_olympic$Total)
## [1] 33
# Select the row(s) holding the maximum rather than hard-coding 33, so the
# answer stays correct if the data changes and ties are all returned.
subset(winter_olympic, Total == max(Total))
## # A tibble: 1 × 7
##    Rank NOC             Gold Silver Bronze Total Region 
##   <dbl> <chr>          <dbl>  <dbl>  <dbl> <dbl> <chr>  
## 1     1  Russia (RUS)*    13     11      9    33 EURASIA
# Cross-tabulate total medals (rows) against region (columns).
table(winter_olympic$Total, winter_olympic$Region)
##     
##      ASIA AUSTRALIA EURASIA EUROPE NORTH_A
##   1     0         0       1      2       0
##   2     0         0       1      0       0
##   3     0         1       0      0       0
##   4     0         0       1      1       0
##   5     0         0       0      1       0
##   6     0         0       1      1       0
##   8     2         0       0      3       0
##   9     1         0       0      0       0
##   11    0         0       0      1       0
##   15    0         0       0      2       0
##   17    0         0       0      1       0
##   19    0         0       0      1       0
##   24    0         0       0      1       0
##   25    0         0       0      0       1
##   26    0         0       0      1       0
##   28    0         0       0      0       1
##   33    0         0       1      0       0

# What is the correlation between rank and total medals? Is this expected or
# surprising? The negative sign means numerically lower ranks (rank 1 is the
# top row) go with higher medal totals.

with(winter_olympic, cor(Rank, Total))
## [1] -0.874864

##Exercise 4.2

library(readr)
# Load the GDP table (264 rows; one column per year from 1960, per the readr
# message below).
# NOTE(review): the path is absolute and specific to one machine; confirm
# whether a project-relative path should be used instead.
gdp <- read_csv("C:/Users/ywang/RStudio/R/gdp.csv")
## Rows: 264 Columns: 60
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (2): Country Name, Country Code
## dbl (58): 1960, 1961, 1962, 1963, 1964, 1965, 1966, 1967, 1968, 1969, 1970, ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# View() opens a GUI data viewer, so only call it from an interactive session;
# this keeps the script runnable when knitted or sourced in batch mode.
if (interactive()) {
  View(gdp)
}
# Rescale the 2017 column by a factor of 1e9, overwriting it in place.
# Running this line twice would divide twice.
gdp[["2017"]] <- gdp[["2017"]] / 1e9
# Centre of the rescaled 2017 values, ignoring missing entries.
mean(gdp[["2017"]], na.rm = TRUE)
## [1] 2848.129
median(gdp[["2017"]], na.rm = TRUE)
## [1] 59.44777
# Smallest and largest non-missing values.
c(min(gdp[["2017"]], na.rm = TRUE), max(gdp[["2017"]], na.rm = TRUE))
## [1] 3.973132e-02 8.068379e+04
# Quartiles (0%, 25%, 50%, 75%, 100%).
quantile(gdp[["2017"]], probs = seq(0, 1, 0.25), na.rm = TRUE)
##           0%          25%          50%          75%         100% 
## 3.973132e-02 1.150900e+01 5.944777e+01 7.175373e+02 8.068379e+04