library(readxl)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.6     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.4     ✓ stringr 1.4.0
## ✓ readr   2.1.1     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(psych)
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha

HOÀNG ĐĂNG KHÁNH

Nhập dữ liệu

dat <- read_excel("set1.xlsx")
dat <- dat %>% filter(Phu_trach == "Khánh")
dat$Pclass <- as.factor(dat$Pclass)
dat$Survival <- factor(dat$Survival, ordered = T, labels = c("No", "Yes"))
dat$Sex <- as.factor(dat$Sex)
dat$Embarked <- as.factor(dat$Embarked)
dat$Age <- as.integer(dat$Age)

Thống kê mô tả

describe(dat)
##            vars  n   mean    sd median trimmed   mad    min max  range  skew
## ID            1 50 298.50 14.58  298.5  298.50 18.53 274.00 323  49.00  0.00
## Survival*     2 50   1.40  0.49    1.0    1.38  0.00   1.00   2   1.00  0.40
## Pclass*       3 50   2.30  0.81    3.0    2.38  0.00   1.00   3   2.00 -0.58
## Name*         4 50  25.50 14.58   25.5   25.50 18.53   1.00  50  49.00  0.00
## Sex*          5 50   1.62  0.49    2.0    1.65  0.00   1.00   2   1.00 -0.48
## Age           6 50  26.80 11.58   25.0   27.05  8.90   1.00  60  59.00  0.01
## SibSp         7 50   0.48  0.97    0.0    0.25  0.00   0.00   5   5.00  2.72
## Parch         8 50   0.42  0.86    0.0    0.23  0.00   0.00   4   4.00  2.13
## Fare          9 50  38.91 58.75   13.0   23.90  7.72   4.01 263 258.99  2.39
## Embarked*    10 50   1.74  0.44    2.0    1.80  0.00   1.00   2   1.00 -1.06
## Phu_trach*   11 50   1.00  0.00    1.0    1.00  0.00   1.00   1   0.00   NaN
##            kurtosis   se
## ID            -1.27 2.06
## Survival*     -1.88 0.07
## Pclass*       -1.28 0.12
## Name*         -1.27 2.06
## Sex*          -1.80 0.07
## Age            0.64 1.64
## SibSp          8.25 0.14
## Parch          4.46 0.12
## Fare           5.17 8.31
## Embarked*     -0.89 0.06
## Phu_trach*      NaN 0.00
dat$discrete_age <- cut_width(dat$Age, 12)

dat <- dat %>% mutate(discrete_age = case_when(
  discrete_age == "[-6,6]" ~ "< 6",
  discrete_age == "(6,18]" ~ "6 - 18",
  discrete_age == "(18,30]" ~ "18 - 30",
  discrete_age == "(30,42]" ~ "30 - 42",
  discrete_age == "(42,54]" ~ "42 - 54",
  discrete_age == "(54,66]" ~ "> 54"
))

dat$discrete_age <- as.factor(dat$discrete_age)

attributes(dat$discrete_age)$levels <- c("< 6", "6 - 18", "18 - 30", "30 - 42", "42 - 54", "> 54")
ggplot(dat, aes(x = discrete_age)) + 
  geom_histogram(stat = "count", fill = "#124B92") + 
  theme_minimal() + 
  labs(x = "Age", y = "Count")
## Warning: Ignoring unknown parameters: binwidth, bins, pad

freq_tab <- as.data.frame(table(dat$discrete_age)) %>% left_join(
 as.data.frame(prop.table(table(dat$discrete_age))) %>% rename(Prop = Freq)
)
## Joining, by = "Var1"
freq_tab
##      Var1 Freq Prop
## 1     < 6    4 0.08
## 2  6 - 18    1 0.02
## 3 18 - 30   26 0.52
## 4 30 - 42   13 0.26
## 5 42 - 54    2 0.04
## 6    > 54    4 0.08
library(ggrepel)

ggplot(freq_tab, aes(x = "", y = Prop, fill = Var1)) +
  geom_col() +
  coord_polar(theta = "y") + 
  geom_label(
    aes(label = paste(Prop * 100, "%")), 
    position = position_stack(vjust = 0.5),
    show.legend = FALSE
  ) + 
  scale_fill_brewer(palette = "Pastel1") + 
  theme_void() +
  guides(
    fill = guide_legend(title = "Age", label = T)
  )

dat %>% 
  # mutate(Survival = factor(Survival, labels = c("No", "Yes"))) %>% 
  group_by(discrete_age, Survival) %>% 
  summarise(count = n()) %>% 
  ggplot(aes(discrete_age, count, fill = Survival)) + 
    geom_bar(stat = "identity", position = 'dodge') +
    scale_fill_brewer(palette = "Pastel2") +
    theme_minimal() + 
    labs(x = "Age", y = "Count")
## `summarise()` has grouped output by 'discrete_age'. You can override using the `.groups` argument.

Hồi quy

m1 <- lm(Fare ~ Age, data = dat)
summary(m1)
## 
## Call:
## lm(formula = Fare ~ Age, data = dat)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -38.215 -30.660 -25.958   5.253 225.557 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)  24.8639    21.2254   1.171    0.247
## Age           0.5241     0.7281   0.720    0.475
## 
## Residual standard error: 59.04 on 48 degrees of freedom
## Multiple R-squared:  0.01068,    Adjusted R-squared:  -0.009931 
## F-statistic: 0.5181 on 1 and 48 DF,  p-value: 0.4751
anova(m1)
## Analysis of Variance Table
## 
## Response: Fare
##           Df Sum Sq Mean Sq F value Pr(>F)
## Age        1   1806  1806.0  0.5181 0.4751
## Residuals 48 167305  3485.5
ggplot(dat, aes(Age, Fare)) + 
  geom_point(col = "#777777") + 
  geom_smooth(method = "lm", se = F) + 
  theme_minimal()
## `geom_smooth()` using formula 'y ~ x'

LÊ QUANG ANH

Nhập dữ liệu

dat <- read_excel("set1.xlsx")
dat <- dat %>% filter(Phu_trach == "anh")
dat$Pclass <- as.factor(dat$Pclass)
dat$Survival <- factor(dat$Survival, ordered = T, labels = c("No", "Yes"))
dat$Sex <- as.factor(dat$Sex)
dat$Embarked <- as.factor(dat$Embarked)
dat$Age <- as.integer(dat$Age)

Thống kê mô tả

describe(dat)
##            vars  n  mean    sd median trimmed   mad  min max  range  skew
## ID            1 50 25.50 14.58  25.50   25.50 18.53 1.00  50  49.00  0.00
## Survival*     2 50  1.40  0.49   1.00    1.38  0.00 1.00   2   1.00  0.40
## Pclass*       3 50  2.30  0.84   3.00    2.38  0.00 1.00   3   2.00 -0.59
## Name*         4 50 25.50 14.58  25.50   25.50 18.53 1.00  50  49.00  0.00
## Sex*          5 50  1.50  0.51   1.50    1.50  0.74 1.00   2   1.00  0.00
## Age           6 50 26.88 16.33  26.50   25.77 17.05 2.00  66  64.00  0.50
## SibSp         7 50  0.92  1.23   1.00    0.67  1.48 0.00   5   5.00  1.58
## Parch         8 50  0.56  1.15   0.00    0.30  0.00 0.00   5   5.00  2.49
## Fare          9 50 31.25 39.47  21.04   23.71 18.12 7.23 263 255.77  4.13
## Embarked*    10 50  2.54  0.81   3.00    2.67  0.00 1.00   3   2.00 -1.24
## Phu_trach*   11 50  1.00  0.00   1.00    1.00  0.00 1.00   1   0.00   NaN
##            kurtosis   se
## ID            -1.27 2.06
## Survival*     -1.88 0.07
## Pclass*       -1.35 0.12
## Name*         -1.27 2.06
## Sex*          -2.04 0.07
## Age           -0.34 2.31
## SibSp          1.86 0.17
## Parch          6.36 0.16
## Fare          21.00 5.58
## Embarked*     -0.34 0.12
## Phu_trach*      NaN 0.00
dat$discrete_age <- cut_width(dat$Age, 12)

dat <- dat %>% mutate(discrete_age = case_when(
  discrete_age == "[-6,6]" ~ "< 6",
  discrete_age == "(6,18]" ~ "6 - 18",
  discrete_age == "(18,30]" ~ "18 - 30",
  discrete_age == "(30,42]" ~ "30 - 42",
  discrete_age == "(42,54]" ~ "42 - 54",
  discrete_age == "(54,66]" ~ "> 54"
))

dat$discrete_age <- as.factor(dat$discrete_age)

attributes(dat$discrete_age)$levels <- c("< 6", "6 - 18", "18 - 30", "30 - 42", "42 - 54", "> 54")
ggplot(dat, aes(x = discrete_age)) + 
  geom_histogram(stat = "count", fill = "#124B92") + 
  theme_minimal() + 
  labs(x = "Age", y = "Count")
## Warning: Ignoring unknown parameters: binwidth, bins, pad

freq_tab <- as.data.frame(table(dat$discrete_age)) %>% left_join(
 as.data.frame(prop.table(table(dat$discrete_age))) %>% rename(Prop = Freq)
)
## Joining, by = "Var1"
freq_tab
##      Var1 Freq Prop
## 1     < 6    6 0.12
## 2  6 - 18    4 0.08
## 3 18 - 30   17 0.34
## 4 30 - 42   11 0.22
## 5 42 - 54    3 0.06
## 6    > 54    9 0.18
library(ggrepel)

ggplot(freq_tab, aes(x = "", y = Prop, fill = Var1)) +
  geom_col() +
  coord_polar(theta = "y") + 
  geom_label(
    aes(label = paste(Prop * 100, "%")), 
    position = position_stack(vjust = 0.5),
    show.legend = FALSE
  ) + 
  scale_fill_brewer(palette = "Pastel1") + 
  theme_void() +
  guides(
    fill = guide_legend(title = "Age", label = T)
  )

dat %>% 
  # mutate(Survival = factor(Survival, labels = c("No", "Yes"))) %>% 
  group_by(discrete_age, Survival) %>% 
  summarise(count = n()) %>% 
  ggplot(aes(discrete_age, count, fill = Survival)) + 
    geom_bar(stat = "identity", position = 'dodge') +
    scale_fill_brewer(palette = "Pastel2") +
    theme_minimal() + 
    labs(x = "Age", y = "Count")
## `summarise()` has grouped output by 'discrete_age'. You can override using the `.groups` argument.

Hồi quy

m1 <- lm(Fare ~ Age, data = dat)
summary(m1)
## 
## Call:
## lm(formula = Fare ~ Age, data = dat)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -30.401 -21.243  -9.594   4.002 233.693 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  24.6203    10.8872   2.261   0.0283 *
## Age           0.2467     0.3471   0.711   0.4807  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 39.67 on 48 degrees of freedom
## Multiple R-squared:  0.01041,    Adjusted R-squared:  -0.0102 
## F-statistic: 0.505 on 1 and 48 DF,  p-value: 0.4807
anova(m1)
## Analysis of Variance Table
## 
## Response: Fare
##           Df Sum Sq Mean Sq F value Pr(>F)
## Age        1    795  794.77   0.505 0.4807
## Residuals 48  75539 1573.72
ggplot(dat, aes(Age, Fare)) + 
  geom_point(col = "#777777") + 
  geom_smooth(method = "lm", se = F) + 
  theme_minimal()
## `geom_smooth()` using formula 'y ~ x'

HOÀNG VĂN NAM

Nhập dữ liệu

dat <- read_excel("set1.xlsx")
dat <- dat %>% filter(Phu_trach == "nam")
dat$Pclass <- as.factor(dat$Pclass)
dat$Survival <- factor(dat$Survival, ordered = T, labels = c("No", "Yes"))
dat$Sex <- as.factor(dat$Sex)
dat$Embarked <- as.factor(dat$Embarked)
dat$Age <- as.integer(dat$Age)

Thống kê mô tả

describe(dat)
##            vars  n   mean    sd median trimmed   mad    min    max range  skew
## ID            1 50 389.50 14.58 389.50  389.50 18.53 365.00 414.00  49.0  0.00
## Survival*     2 50   1.42  0.50   1.00    1.40  0.00   1.00   2.00   1.0  0.31
## Pclass*       3 50   2.16  0.91   2.50    2.20  0.74   1.00   3.00   2.0 -0.31
## Name*         4 50  25.50 14.58  25.50   25.50 18.53   1.00  50.00  49.0  0.00
## Sex*          5 50   1.64  0.48   2.00    1.68  0.00   1.00   2.00   1.0 -0.57
## Age           6 50  33.72 16.00  32.50   33.42 15.57   0.00  71.00  71.0  0.24
## SibSp         7 50   0.36  0.83   0.00    0.20  0.00   0.00   5.00   5.0  3.73
## Parch         8 50   0.22  0.58   0.00    0.05  0.00   0.00   2.00   2.0  2.39
## Fare          9 50  30.47 32.41  17.58   23.80 14.36   7.05 151.55 144.5  1.78
## Embarked*    10 50   2.60  0.78   3.00    2.75  0.00   1.00   3.00   2.0 -1.45
## Phu_trach*   11 50   1.00  0.00   1.00    1.00  0.00   1.00   1.00   0.0   NaN
##            kurtosis   se
## ID            -1.27 2.06
## Survival*     -1.94 0.07
## Pclass*       -1.75 0.13
## Name*         -1.27 2.06
## Sex*          -1.71 0.07
## Age           -0.41 2.26
## SibSp         17.22 0.12
## Parch          4.23 0.08
## Fare           2.74 4.58
## Embarked*      0.21 0.11
## Phu_trach*      NaN 0.00
dat$discrete_age <- cut_width(dat$Age, 12)

dat <- dat %>% mutate(discrete_age = case_when(
  discrete_age == "[-6,6]" ~ "< 6",
  discrete_age == "(6,18]" ~ "6 - 18",
  discrete_age == "(18,30]" ~ "18 - 30",
  discrete_age == "(30,42]" ~ "30 - 42",
  discrete_age == "(42,54]" ~ "42 - 54",
  discrete_age == "(54,66]" ~ "> 54"
))

dat$discrete_age <- as.factor(dat$discrete_age)

attributes(dat$discrete_age)$levels <- c("< 6", "6 - 18", "18 - 30", "30 - 42", "42 - 54", "> 54")
ggplot(dat, aes(x = discrete_age)) + 
  geom_histogram(stat = "count", fill = "#124B92") + 
  theme_minimal() + 
  labs(x = "Age", y = "Count")
## Warning: Ignoring unknown parameters: binwidth, bins, pad

freq_tab <- as.data.frame(table(dat$discrete_age)) %>% left_join(
 as.data.frame(prop.table(table(dat$discrete_age))) %>% rename(Prop = Freq)
)
## Joining, by = "Var1"
freq_tab
##      Var1 Freq       Prop
## 1     < 6    2 0.04081633
## 2  6 - 18    5 0.10204082
## 3 18 - 30   17 0.34693878
## 4 30 - 42   12 0.24489796
## 5 42 - 54    8 0.16326531
## 6    > 54    5 0.10204082
library(ggrepel)

ggplot(freq_tab, aes(x = "", y = Prop, fill = Var1)) +
  geom_col() +
  coord_polar(theta = "y") + 
  geom_label(
    aes(label = paste(round(Prop * 100, 2), "%")), 
    position = position_stack(vjust = 0.5),
    show.legend = FALSE
  ) + 
  scale_fill_brewer(palette = "Pastel1") + 
  theme_void() +
  guides(
    fill = guide_legend(title = "Age", label = T)
  )

dat %>% 
  # mutate(Survival = factor(Survival, labels = c("No", "Yes"))) %>% 
  group_by(discrete_age, Survival) %>% 
  summarise(count = n()) %>% 
  ggplot(aes(discrete_age, count, fill = Survival)) + 
    geom_bar(stat = "identity", position = 'dodge') +
    scale_fill_brewer(palette = "Pastel2") +
    theme_minimal() + 
    labs(x = "Age", y = "Count")
## `summarise()` has grouped output by 'discrete_age'. You can override using the `.groups` argument.

Hồi quy

m1 <- lm(Fare ~ Age, data = dat)
summary(m1)
## 
## Call:
## lm(formula = Fare ~ Age, data = dat)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -23.460 -22.378 -13.137   2.807 121.010 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)   
## (Intercept) 30.754592  10.891043   2.824  0.00689 **
## Age         -0.008586   0.292327  -0.029  0.97669   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 32.75 on 48 degrees of freedom
## Multiple R-squared:  1.797e-05,  Adjusted R-squared:  -0.02081 
## F-statistic: 0.0008626 on 1 and 48 DF,  p-value: 0.9767
anova(m1)
## Analysis of Variance Table
## 
## Response: Fare
##           Df Sum Sq Mean Sq F value Pr(>F)
## Age        1      1    0.93   9e-04 0.9767
## Residuals 48  51478 1072.47
ggplot(dat, aes(Age, Fare)) + 
  geom_point(col = "#777777") + 
  geom_smooth(method = "lm", se = F) + 
  theme_minimal()
## `geom_smooth()` using formula 'y ~ x'