A. THEORY

1. Input data set

# Load packages first so a missing dependency fails before any data is read.
library(MASS)

# Build the path to the raw data explicitly instead of calling setwd():
# changing the working directory silently affects every later relative path.
data_dir <- "~/OneDrive - UGent/Ugent_IMAQUA Semester1_2022/2. Applied Statistic/Excercise/Raw data"
survey <- read.csv(file.path(data_dir, "surveyAS2008.csv"))

# `cats` ships with MASS; it has the columns Sex, Bwt and Hwt.
data(cats)

2. Categorical variables

# Compact overview of the survey data: number of observations, and the name,
# type and first few values of every column.
str(survey)
## 'data.frame':    50 obs. of  12 variables:
##  $ gender    : chr  "male" "male" "male" "male" ...
##  $ age       : num  24 36 31 30 24 22 25 23 22 22 ...
##  $ distance  : num  5 3 60 2 2 1 3.5 60 2 6 ...
##  $ traveltime: int  10 5 50 20 5 5 50 60 5 18 ...
##  $ transport : chr  "bike" "public" "public" "foot" ...
##  $ rice      : int  0 1 1 1 1 1 1 1 1 1 ...
##  $ potatoes  : int  7 4 4 3 2 2 2 7 3 4 ...
##  $ pasta     : int  7 2 3 4 2 2 3 1 2 2 ...
##  $ bread     : int  7 7 6 7 7 7 7 7 7 7 ...
##  $ cereals   : int  7 5 2 4 0 3 7 1 5 1 ...
##  $ coffee    : num  1 0 4 1 7 0 1 2 1 1 ...
##  $ sports    : num  6 4 0 2 0 1 1 3 6 3 ...
# First six rows of the survey data, to see what the raw values look like.
head(survey)
##   gender age distance traveltime transport rice potatoes pasta bread cereals
## 1   male  24        5         10      bike    0        7     7     7       7
## 2   male  36        3          5    public    1        4     2     7       5
## 3   male  31       60         50    public    1        4     3     6       2
## 4   male  30        2         20      foot    1        3     4     7       4
## 5   male  24        2          5      bike    1        2     2     7       0
## 6   male  22        1          5      foot    1        2     2     7       3
##   coffee sports
## 1      1      6
## 2      0      4
## 3      4      0
## 4      1      2
## 5      7      0
## 6      0      1
dim(survey) # number of rows, then number of columns
## [1] 50 12
# NOTE(review): Sex is a factor, so as.numeric() returns its integer level
# codes (1 for "F", 2 for "M"). as.logical() maps every non-zero number to
# TRUE, so the result is TRUE for all cats and does not distinguish the sexes.
# If a sex indicator was intended, `cats$Sex == "M"` is presumably what is
# wanted -- confirm.
as.logical(as.numeric(cats$Sex))
##   [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [16] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [31] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [46] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [61] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [76] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [91] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [106] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [121] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [136] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
# Sex is a categorical variable (a factor): list its distinct levels.
levels(cats[["Sex"]])
## [1] "F" "M"
# Descriptive analysis of a categorical variable
## Option 1: frequency tables built with table()
table(cats[["Sex"]]) # absolute frequencies
## 
##  F  M 
## 47 97
table(cats[["Sex"]]) / nrow(cats) # relative frequencies (count / number of rows)
## 
##         F         M 
## 0.3263889 0.6736111
## or crosstable
# library() stops with an error when gmodels is not installed, whereas
# require() only returns FALSE and lets the next line fail less clearly.
library(gmodels)
# One-way cross table: count and proportion of the table total per level of Sex.
CrossTable(cats$Sex)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  144 
## 
##  
##           |         F |         M | 
##           |-----------|-----------|
##           |        47 |        97 | 
##           |     0.326 |     0.674 | 
##           |-----------|-----------|
## 
## 
## 
## 
## or summary function
# For a factor, summary() returns the count per level (the same numbers that
# table() printed above).
summary(cats$Sex)
##  F  M 
## 47 97

3. Continuous variables

# Attach MASS with library() so a missing package raises an error right away;
# require() would only return FALSE.
library(MASS)
# First six rows of the cats data (columns Sex, Bwt, Hwt).
head(cats)
##   Sex Bwt Hwt
## 1   F 2.0 7.0
## 2   F 2.0 7.4
## 3   F 2.0 9.5
## 4   F 2.1 7.2
## 5   F 2.1 7.3
## 6   F 2.1 7.6
# Structure of cats: Sex is a two-level factor, Bwt and Hwt are numeric.
str(cats)
## 'data.frame':    144 obs. of  3 variables:
##  $ Sex: Factor w/ 2 levels "F","M": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Bwt: num  2 2 2 2.1 2.1 2.1 2.1 2.1 2.1 2.1 ...
##  $ Hwt: num  7 7.4 9.5 7.2 7.3 7.6 8.1 8.2 8.3 8.5 ...
# Rows and columns of the cats data.
dim(cats)
## [1] 144   3
# Pull the two numeric columns out as plain vectors for the examples below.
bwt <- cats[["Bwt"]]
hwt <- cats[["Hwt"]]
## Describe a continuous variable
### with individual functions
quantile(bwt) # minimum, the three quartiles and maximum in one call
##    0%   25%   50%   75%  100% 
## 2.000 2.300 2.700 3.025 3.900
IQR(bwt) # interquartile range
## [1] 0.725
# Distance of each quartile from the first quartile. The printed name is taken
# from the left-hand operand.
quantile(bwt)["75%"] - quantile(bwt)["25%"] # Q3 - Q1, the same value as IQR(bwt)
##   75% 
## 0.725
quantile(bwt)["50%"] - quantile(bwt)["25%"] # median - Q1
## 50% 
## 0.4
# NOTE(review): this subtracts the first quartile from itself, so it is always
# 0. If the distance from the minimum to Q1 was intended, the right-hand side
# should presumably be quantile(bwt)["0%"] -- confirm.
quantile(bwt)["25%"] - quantile(bwt)["25%"] # Q1 - Q1
## 25% 
##   0
# Largest and smallest value in bwt.
max(bwt)
## [1] 3.9
min(bwt)
## [1] 2
# Centre and spread: sd() is the square root of var().
mean(bwt)
## [1] 2.723611
sd(bwt)
## [1] 0.4853066
var(bwt)
## [1] 0.2355225
### or summary
# summary() reports minimum, quartiles, mean and maximum in a single call.
summary(bwt)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   2.000   2.300   2.700   2.724   3.025   3.900
### or describe() from the psych package
# library() errors immediately if psych is missing; require() would only
# return FALSE and let describe() fail afterwards.
library(psych)
describe(bwt)
##    vars   n mean   sd median trimmed  mad min max range skew kurtosis   se
## X1    1 144 2.72 0.49    2.7    2.69 0.59   2 3.9   1.9 0.47    -0.72 0.04
# ranges = FALSE drops median, trimmed, mad, min, max and range for a more
# compact table. FALSE is spelled out because F is an ordinary variable that
# can be reassigned.
describe(bwt, ranges = FALSE)
##    vars   n mean   sd skew kurtosis   se
## X1    1 144 2.72 0.49 0.47    -0.72 0.04
# describeBy() replaces the deprecated describe.by(): the same statistics,
# computed separately for each level of the grouping variable.
describeBy(bwt, group = cats$Sex)
## 
##  Descriptive statistics by group 
## group: F
##    vars  n mean   sd median trimmed mad min max range skew kurtosis   se
## X1    1 47 2.36 0.27    2.3    2.33 0.3   2   3     1 0.87    -0.21 0.04
## ------------------------------------------------------------ 
## group: M
##    vars  n mean   sd median trimmed  mad min max range skew kurtosis   se
## X1    1 97  2.9 0.47    2.9    2.89 0.59   2 3.9   1.9 0.13     -0.8 0.05
# Reminder of the column order (Sex, Bwt, Hwt) before subsetting by position.
head(cats)
##   Sex Bwt Hwt
## 1   F 2.0 7.0
## 2   F 2.0 7.4
## 3   F 2.0 9.5
## 4   F 2.1 7.2
## 5   F 2.1 7.3
## 6   F 2.1 7.6
# Subset the data: keep all rows and select columns 2 and 3 (Bwt, Hwt).
summary(cats[, 2:3])
##       Bwt             Hwt       
##  Min.   :2.000   Min.   : 6.30  
##  1st Qu.:2.300   1st Qu.: 8.95  
##  Median :2.700   Median :10.10  
##  Mean   :2.724   Mean   :10.63  
##  3rd Qu.:3.025   3rd Qu.:12.12  
##  Max.   :3.900   Max.   :20.50
# Keep all rows, drop column 2 (Bwt) and keep every remaining column.
summary(cats[, -2])
##  Sex         Hwt       
##  F:47   Min.   : 6.30  
##  M:97   1st Qu.: 8.95  
##         Median :10.10  
##         Mean   :10.63  
##         3rd Qu.:12.12  
##         Max.   :20.50
# First six rows of the survey data again, as a reference for the exercises.
head(survey)
##   gender age distance traveltime transport rice potatoes pasta bread cereals
## 1   male  24        5         10      bike    0        7     7     7       7
## 2   male  36        3          5    public    1        4     2     7       5
## 3   male  31       60         50    public    1        4     3     6       2
## 4   male  30        2         20      foot    1        3     4     7       4
## 5   male  24        2          5      bike    1        2     2     7       0
## 6   male  22        1          5      foot    1        2     2     7       3
##   coffee sports
## 1      1      6
## 2      0      4
## 3      4      0
## 4      1      2
## 5      7      0
## 6      0      1

B. EXERCISE

1. If you haven’t done so previously, download the surveyAS2008.csv dataset from Ufora and load it into R. Investigate the structure of the dataset, i.e. make sure you know its dimensions and which variables it has.

# Read the survey straight from its folder instead of calling setwd(), so the
# working directory of the session is left untouched.
data_dir <- "~/OneDrive - UGent/Ugent_IMAQUA Semester1_2022/2. Applied Statistic/Excercise/Raw data"
survey <- read.csv(file.path(data_dir, "surveyAS2008.csv"))
# Column names, types and first values.
str(survey)
## 'data.frame':    50 obs. of  12 variables:
##  $ gender    : chr  "male" "male" "male" "male" ...
##  $ age       : num  24 36 31 30 24 22 25 23 22 22 ...
##  $ distance  : num  5 3 60 2 2 1 3.5 60 2 6 ...
##  $ traveltime: int  10 5 50 20 5 5 50 60 5 18 ...
##  $ transport : chr  "bike" "public" "public" "foot" ...
##  $ rice      : int  0 1 1 1 1 1 1 1 1 1 ...
##  $ potatoes  : int  7 4 4 3 2 2 2 7 3 4 ...
##  $ pasta     : int  7 2 3 4 2 2 3 1 2 2 ...
##  $ bread     : int  7 7 6 7 7 7 7 7 7 7 ...
##  $ cereals   : int  7 5 2 4 0 3 7 1 5 1 ...
##  $ coffee    : num  1 0 4 1 7 0 1 2 1 1 ...
##  $ sports    : num  6 4 0 2 0 1 1 3 6 3 ...
# Dimensions: 50 observations (rows) and 12 variables (columns).
dim(survey)
## [1] 50 12
# First six rows, to see what the raw values look like.
head(survey)
##   gender age distance traveltime transport rice potatoes pasta bread cereals
## 1   male  24        5         10      bike    0        7     7     7       7
## 2   male  36        3          5    public    1        4     2     7       5
## 3   male  31       60         50    public    1        4     3     6       2
## 4   male  30        2         20      foot    1        3     4     7       4
## 5   male  24        2          5      bike    1        2     2     7       0
## 6   male  22        1          5      foot    1        2     2     7       3
##   coffee sports
## 1      1      6
## 2      0      4
## 3      4      0
## 4      1      2
## 5      7      0
## 6      0      1

2. Give summary information for all variables, except the starch variables (rice, potatoes, pasta, bread and cereals).

# Summarise all rows, dropping columns 6 to 10 (rice, potatoes, pasta, bread, cereals).
summary(survey[, -(6:10)])
##     gender               age           distance       traveltime  
##  Length:50          Min.   :22.00   Min.   :  0.5   Min.   : 5.0  
##  Class :character   1st Qu.:25.00   1st Qu.:  2.0   1st Qu.:10.0  
##  Mode  :character   Median :27.00   Median :  3.0   Median :19.0  
##                     Mean   :27.69   Mean   : 11.2   Mean   :21.8  
##                     3rd Qu.:30.00   3rd Qu.:  5.0   3rd Qu.:30.0  
##                     Max.   :37.00   Max.   :150.0   Max.   :90.0  
##   transport             coffee          sports      
##  Length:50          Min.   :0.000   Min.   : 0.000  
##  Class :character   1st Qu.:0.000   1st Qu.: 0.000  
##  Mode  :character   Median :1.000   Median : 1.000  
##                     Mean   :1.876   Mean   : 1.930  
##                     3rd Qu.:2.750   3rd Qu.: 2.875  
##                     Max.   :7.000   Max.   :10.000

3. Give the frequency table for the transport variable. Do this twice: once with absolute frequencies, once with relative frequencies.

# Absolute frequency of each transport mode.
table(survey[["transport"]])
## 
##   bike   foot public 
##      8     21     21
# Relative frequency: each count divided by the number of rows in the survey.
table(survey[["transport"]]) / nrow(survey)
## 
##   bike   foot public 
##   0.16   0.42   0.42

4. Give the mean and standard deviation for the travel time variable. How does the mean compare to the median?

# Mean vs. median of travel time: the mean (21.8) lies above the median (19),
# which suggests a right-skewed distribution with some long travel times
# (the maximum in the summary for exercise 2 is 90).
mean(survey$traveltime)
## [1] 21.8
# The standard deviation is large relative to the mean, i.e. travel times vary
# widely between observations.
sd(survey$traveltime)
## [1] 16.81836
median(survey$traveltime)
## [1] 19

5. Give the mean and standard deviation for the travel time variable, separately for female and male students. How do male and female students compare?

# Overall mean travel time, as a reference for the per-gender means below.
mean(survey$traveltime)
## [1] 21.8
# NOTE: despite their names, male_mean and female_mean hold the raw travel
# times of each gender, not means. The names are kept so that any later code
# referring to them keeps working.
male_mean <- survey$traveltime[survey$gender == "male"]
female_mean <- survey$traveltime[survey$gender == "female"]
mean(male_mean)
## [1] 17.33333
mean(female_mean)
## [1] 28.5
# The exercise also asks for the standard deviation per gender.
# (The printed values for these two calls appear once the document is re-knit.)
sd(male_mean)
sd(female_mean)
# create a table by using data.frame
# NOTE(review): this toy data frame does not use the survey data. The `points`
# column is drawn with runif() and no set.seed() call is visible, so the values
# printed below come from one particular run and will differ on re-run.
# The name `df` also coincides with the function stats::df(); calls such as
# df(...) still find the function, but the shared name can confuse readers.
df <- data.frame(team=rep(c('A', 'B', 'C', 'D'), each=4),
                 pos=rep(c('G', 'F'), times=8),
                 points=round(runif(16, 4, 20),0))
head(df)
##   team pos points
## 1    A   G     14
## 2    A   F     18
## 3    A   G     13
## 4    A   F     16
## 5    B   G      9
## 6    B   F     17

6. Give the important summary statistics for the sport variable, and interpret them.

# Minimum, quartiles, mean and maximum of the sports variable (units are not
# shown in this file). In the output below the mean (1.93) exceeds the median
# (1) and the maximum (10) lies far above the third quartile (2.875), which
# suggests a right-skewed distribution with a few very high values. The first
# quartile is 0, so roughly a quarter or more of the students report 0.
summary(survey$sports)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   1.000   1.930   2.875  10.000

7. Give the important summary statistics for the sport variable, separately for female and male students. How do male and female students compare?

# Same summary, split by gender. In the printed output the male students have
# the higher median (1.5 vs 1.0), mean (2.217 vs 1.5), third quartile (3 vs 2)
# and maximum (10 vs 7). Both groups share a minimum and first quartile of 0.
summary(survey$sports[survey$gender=="male"])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   1.500   2.217   3.000  10.000
summary(survey$sports[survey$gender=="female"])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0     0.0     1.0     1.5     2.0     7.0