A. THEORY
2. Categorical variables
# Inspect the structure of survey: one line per column with its type and
# first few values.
str(survey)
## 'data.frame': 50 obs. of 12 variables:
## $ gender : chr "male" "male" "male" "male" ...
## $ age : num 24 36 31 30 24 22 25 23 22 22 ...
## $ distance : num 5 3 60 2 2 1 3.5 60 2 6 ...
## $ traveltime: int 10 5 50 20 5 5 50 60 5 18 ...
## $ transport : chr "bike" "public" "public" "foot" ...
## $ rice : int 0 1 1 1 1 1 1 1 1 1 ...
## $ potatoes : int 7 4 4 3 2 2 2 7 3 4 ...
## $ pasta : int 7 2 3 4 2 2 3 1 2 2 ...
## $ bread : int 7 7 6 7 7 7 7 7 7 7 ...
## $ cereals : int 7 5 2 4 0 3 7 1 5 1 ...
## $ coffee : num 1 0 4 1 7 0 1 2 1 1 ...
## $ sports : num 6 4 0 2 0 1 1 3 6 3 ...
# Preview the first rows of survey (head() prints six rows by default).
head(survey)
## gender age distance traveltime transport rice potatoes pasta bread cereals
## 1 male 24 5 10 bike 0 7 7 7 7
## 2 male 36 3 5 public 1 4 2 7 5
## 3 male 31 60 50 public 1 4 3 6 2
## 4 male 30 2 20 foot 1 3 4 7 4
## 5 male 24 2 5 bike 1 2 2 7 0
## 6 male 22 1 5 foot 1 2 2 7 3
## coffee sports
## 1 1 6
## 2 0 4
## 3 4 0
## 4 1 2
## 5 7 0
## 6 0 1
dim(survey) # rows and columns
## [1] 50 12
# Sex is a factor, so as.numeric() returns its integer level codes (1 and 2);
# as.logical() maps every non-zero number to TRUE, hence the all-TRUE result.
as.logical(as.numeric(cats$Sex))
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [16] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [31] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [46] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [61] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [76] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [91] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [106] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [121] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [136] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
# Sex is a categorical variable: list its factor levels.
levels(cats[["Sex"]])
## [1] "F" "M"
# Descriptive analysis
## Option 1: frequency tables with table()
table(cats[["Sex"]]) # absolute frequencies
##
## F M
## 47 97
table(cats[["Sex"]]) / nrow(cats) # relative frequencies
##
## F M
## 0.3263889 0.6736111
## or a cross table (CrossTable() from the gmodels package)
# library() stops with an error when gmodels is not installed; require() would
# only return FALSE and let CrossTable() fail later with a less helpful message.
library(gmodels)
CrossTable(cats$Sex)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 144
##
##
## | F | M |
## |-----------|-----------|
## | 47 | 97 |
## | 0.326 | 0.674 |
## |-----------|-----------|
##
##
##
##
## or the summary function (for a factor it reports the count per level)
summary(cats$Sex)
## F M
## 47 97
3. Continuous variables
# The cats data set comes from MASS. library() fails loudly if the package is
# missing, whereas require() would only return FALSE and carry on.
library(MASS)
head(cats)
## Sex Bwt Hwt
## 1 F 2.0 7.0
## 2 F 2.0 7.4
## 3 F 2.0 9.5
## 4 F 2.1 7.2
## 5 F 2.1 7.3
## 6 F 2.1 7.6
# Structure and dimensions of cats.
str(cats)
## 'data.frame': 144 obs. of 3 variables:
## $ Sex: Factor w/ 2 levels "F","M": 1 1 1 1 1 1 1 1 1 1 ...
## $ Bwt: num 2 2 2 2.1 2.1 2.1 2.1 2.1 2.1 2.1 ...
## $ Hwt: num 7 7.4 9.5 7.2 7.3 7.6 8.1 8.2 8.3 8.5 ...
dim(cats)
## [1] 144 3
# Pull the two numeric columns out as plain vectors for the analysis below.
bwt <- cats[["Bwt"]]
hwt <- cats[["Hwt"]]
## Descriptive statistics for a continuous variable
### using individual functions
quantile(bwt)
## 0% 25% 50% 75% 100%
## 2.000 2.300 2.700 3.025 3.900
IQR(bwt)
## [1] 0.725
# quantile(bwt) returns minimum, Q1, median, Q3 and maximum at positions 1..5.
# The printed name of each difference is inherited from the first operand.
quantile(bwt)[4] - quantile(bwt)[2] # Q3 - Q1, identical to IQR(bwt)
## 75%
## 0.725
quantile(bwt)[3] - quantile(bwt)[2] # median - Q1
## 50%
## 0.4
# Q1 - minimum. Subtracting Q1 from itself would always give 0.
quantile(bwt)[2] - quantile(bwt)[1]
## 25%
## 0.3
max(bwt)
## [1] 3.9
min(bwt)
## [1] 2
mean(bwt)
## [1] 2.723611
sd(bwt)
## [1] 0.4853066
var(bwt)
## [1] 0.2355225
### or summary
summary(bwt)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.000 2.300 2.700 2.724 3.025 3.900
### or describe (psych package)
# library() errors immediately if psych is missing; require() would not.
library(psych)
describe(bwt)
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 144 2.72 0.49 2.7 2.69 0.59 2 3.9 1.9 0.47 -0.72 0.04
# ranges = FALSE drops the median/trimmed/mad/min/max/range columns.
# FALSE is spelled out because F is an ordinary variable that can be reassigned.
describe(bwt, ranges = FALSE)
## vars n mean sd skew kurtosis se
## X1 1 144 2.72 0.49 0.47 -0.72 0.04
# describeBy() is the supported replacement for the deprecated describe.by():
# it runs describe() separately for each level of the grouping variable.
describeBy(bwt, cats$Sex)
##
## Descriptive statistics by group
## group: F
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 47 2.36 0.27 2.3 2.33 0.3 2 3 1 0.87 -0.21 0.04
## ------------------------------------------------------------
## group: M
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 97 2.9 0.47 2.9 2.89 0.59 2 3.9 1.9 0.13 -0.8 0.05
# Preview cats again before selecting columns by position.
head(cats)
## Sex Bwt Hwt
## 1 F 2.0 7.0
## 2 F 2.0 7.4
## 3 F 2.0 9.5
## 4 F 2.1 7.2
## 5 F 2.1 7.3
## 6 F 2.1 7.6
# Subset the data: keep every row and select columns 2 and 3.
summary(cats[, 2:3])
## Bwt Hwt
## Min. :2.000 Min. : 6.30
## 1st Qu.:2.300 1st Qu.: 8.95
## Median :2.700 Median :10.10
## Mean :2.724 Mean :10.63
## 3rd Qu.:3.025 3rd Qu.:12.12
## Max. :3.900 Max. :20.50
# Keep every row, drop column 2, and keep all remaining columns.
summary(cats[, -2])
## Sex Hwt
## F:47 Min. : 6.30
## M:97 1st Qu.: 8.95
## Median :10.10
## Mean :10.63
## 3rd Qu.:12.12
## Max. :20.50
# Preview the first rows of survey (same data as shown earlier).
head(survey)
## gender age distance traveltime transport rice potatoes pasta bread cereals
## 1 male 24 5 10 bike 0 7 7 7 7
## 2 male 36 3 5 public 1 4 2 7 5
## 3 male 31 60 50 public 1 4 3 6 2
## 4 male 30 2 20 foot 1 3 4 7 4
## 5 male 24 2 5 bike 1 2 2 7 0
## 6 male 22 1 5 foot 1 2 2 7 3
## coffee sports
## 1 1 6
## 2 0 4
## 3 4 0
## 4 1 2
## 5 7 0
## 6 0 1
B. EXERCISE
1. If you haven’t done so previously, download the surveyAS2008.csv
dataset from Ufora and load it into R. Investigate the structure of the
dataset, i.e. make sure you know its dimensions and which variables it
has.
# NOTE(review): setwd() ties this script to one machine. It is kept so that any
# later relative paths still resolve; consider a project-relative path instead.
setwd("~/OneDrive - UGent/Ugent_IMAQUA Semester1_2022/2. Applied Statistic/Excercise/Raw data")
# Read the survey data from the working directory. `<-` is the idiomatic
# assignment operator in R (`=` is conventionally reserved for arguments).
survey <- read.csv("surveyAS2008.csv")
# Check column names and types of the loaded data.
str(survey)
## 'data.frame': 50 obs. of 12 variables:
## $ gender : chr "male" "male" "male" "male" ...
## $ age : num 24 36 31 30 24 22 25 23 22 22 ...
## $ distance : num 5 3 60 2 2 1 3.5 60 2 6 ...
## $ traveltime: int 10 5 50 20 5 5 50 60 5 18 ...
## $ transport : chr "bike" "public" "public" "foot" ...
## $ rice : int 0 1 1 1 1 1 1 1 1 1 ...
## $ potatoes : int 7 4 4 3 2 2 2 7 3 4 ...
## $ pasta : int 7 2 3 4 2 2 3 1 2 2 ...
## $ bread : int 7 7 6 7 7 7 7 7 7 7 ...
## $ cereals : int 7 5 2 4 0 3 7 1 5 1 ...
## $ coffee : num 1 0 4 1 7 0 1 2 1 1 ...
## $ sports : num 6 4 0 2 0 1 1 3 6 3 ...
# Dimensions: number of rows, then number of columns.
dim(survey)
## [1] 50 12
# First rows of the data.
head(survey)
## gender age distance traveltime transport rice potatoes pasta bread cereals
## 1 male 24 5 10 bike 0 7 7 7 7
## 2 male 36 3 5 public 1 4 2 7 5
## 3 male 31 60 50 public 1 4 3 6 2
## 4 male 30 2 20 foot 1 3 4 7 4
## 5 male 24 2 5 bike 1 2 2 7 0
## 6 male 22 1 5 foot 1 2 2 7 3
## coffee sports
## 1 1 6
## 2 0 4
## 3 4 0
## 4 1 2
## 5 7 0
## 6 0 1
3. Give the frequency table for the transport variable. Do this
twice: once with absolute frequencies, once with relative
frequencies.
# Absolute frequency of each transport mode.
table(survey[["transport"]])
##
## bike foot public
## 8 21 21
# Relative frequency: each count divided by the number of rows in survey.
table(survey[["transport"]]) / nrow(survey)
##
## bike foot public
## 0.16 0.42 0.42
5. Give the mean and standard deviation for the travel time
variable, separately for female and male students. How do male and
female students compare?
# Overall mean travel time across all students.
mean(survey$traveltime)
## [1] 21.8
# Travel times split by gender. Despite the *_mean names (kept so any later
# code that uses them still works), these vectors hold the raw travel times,
# not means.
male_mean <- survey$traveltime[survey$gender == "male"]
female_mean <- survey$traveltime[survey$gender == "female"]
# Mean per gender: the female mean (28.5) is higher than the male mean (17.3).
mean(male_mean)
## [1] 17.33333
mean(female_mean)
## [1] 28.5
# The exercise also asks for the standard deviation per gender.
# These calls are newly added; re-run the script to see their output.
sd(male_mean)
sd(female_mean)
# Build a small example table with data.frame(): 16 rows, three columns.
#   team   - "A".."D", each repeated four times in a row
#   pos    - "G" and "F" alternating
#   points - random uniform draws between 4 and 20, rounded to whole numbers
df <- data.frame(
  team = rep(c("A", "B", "C", "D"), each = 4),
  pos = rep(c("G", "F"), times = 8),
  points = round(runif(n = 16, min = 4, max = 20), digits = 0)
)
# Show the first rows of the new table.
head(df)
## team pos points
## 1 A G 14
## 2 A F 18
## 3 A G 13
## 4 A F 16
## 5 B G 9
## 6 B F 17
6. Give the important summary statistics for the sports variable, and
interpret them.
# Five-number summary plus mean of the sports column.
# Interpretation of the printed values: at least a quarter of the students
# report 0 (1st quartile) and half report at most 1 (median), while the mean
# (1.93) and the maximum (10) are clearly higher, so the distribution is
# right-skewed. The unit of measurement is not shown here; TODO confirm.
summary(survey[["sports"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 1.000 1.930 2.875 10.000