Đọc dữ liệu vào R
# Load the professorial salaries dataset from the local course folder,
# then report its dimensions (rows, columns).
salary <- read.csv(
  "C:\\Thach\\UTS\\Teaching\\TRM\\Practical Data Analysis\\2023_Spring semester\\Data\\Professorial Salaries.csv"
)
dim(salary)
## [1] 397 9
# Compare the mean number of publications (NPubs) between the two Sex groups.
# With the default settings this runs a Welch two-sample t-test, as the
# output header below reports.
t.test(NPubs ~ Sex, data = salary)
##
## Welch Two Sample t-test
##
## data: NPubs by Sex
## t = 0.93955, df = 46.045, p-value = 0.3524
## alternative hypothesis: true difference in means between group Female and group Male is not equal to 0
## 95 percent confidence interval:
## -2.601989 7.157497
## sample estimates:
## mean in group Female mean in group Male
## 20.20513 17.92737
# Rank-based counterpart of the t-test above: Wilcoxon rank sum test of
# NPubs between the two Sex groups (continuity correction is applied,
# per the output header below).
wilcox.test(NPubs ~ Sex, data = salary)
##
## Wilcoxon rank sum test with continuity correction
##
## data: NPubs by Sex
## W = 7691.5, p-value = 0.2963
## alternative hypothesis: true location shift is not equal to 0
Kiểm tra phân bố
library(ggplot2)
# Histogram of NPubs on the density scale with a kernel density curve on top.
# after_stat(density) replaces the `..density..` dot-dot notation, which was
# deprecated in ggplot2 3.4.0. bins = 30 spells out the stat_bin() default,
# so the binning is unchanged and the "Pick better value" message is no
# longer emitted.
p <- ggplot(data = salary, aes(x = NPubs))
p <- p +
  geom_histogram(
    aes(y = after_stat(density)),
    bins = 30,
    color = "white",
    fill = "blue"
  )
p + geom_density(col = "red")
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(simpleboot)
## Warning: package 'simpleboot' was built under R version 4.3.2
## Simple Bootstrap Routines (1.1-7)
# Split the data by sex (filter here is dplyr's, which masks stats::filter).
male <- filter(salary, Sex == "Male")
female <- filter(salary, Sex == "Female")

# Bootstrap the difference in mean NPubs (female sample first, male second)
# with 1000 replicates; the seed is fixed so the replicates are reproducible.
set.seed(1234)
b.means <- two.boot(female$NPubs, male$NPubs, mean, R = 1000)

# Distribution of the bootstrap replicates, then the 2.5%, 50% and 97.5%
# quantiles (a percentile-style 95% interval around the median).
hist(b.means$t, breaks = 20)
quantile(b.means$t, probs = c(0.025, 0.50, 0.975))
## 2.5% 50% 97.5%
## -2.233298 1.960142 6.990984
library(table1)
##
## Attaching package: 'table1'
## The following objects are masked from 'package:base':
##
## units, units<-
# Descriptive table of Rank (count and percentage per level), stratified by
# Sex with an additional overall column.
table1(~ Rank | Sex, data = salary)
| | Female (N=39) | Male (N=358) | Overall (N=397) |
|---|---|---|---|
| **Rank** | | | |
| AssocProf | 10 (25.6%) | 54 (15.1%) | 64 (16.1%) |
| AsstProf | 11 (28.2%) | 56 (15.6%) | 67 (16.9%) |
| Prof | 18 (46.2%) | 248 (69.3%) | 266 (67.0%) |
# Pearson chi-squared test of association between Rank and Sex.
# NOTE(review): per the chisq.test documentation, `correct` only affects
# 2 x 2 tables; Rank has three levels here, so correct = FALSE should not
# change the result - confirm before relying on it.
chisq.test(salary$Rank, salary$Sex, correct = FALSE)
##
## Pearson's Chi-squared test
##
## data: salary$Rank and salary$Sex
## X-squared = 8.5259, df = 2, p-value = 0.01408
# Scatter plot of years of service against years since PhD with the default
# geom_smooth() curve (loess for this data, per the message printed below).
p <- ggplot(
  data = salary,
  aes(x = Yrs.since.phd, y = Yrs.service)
)
p1 <- p +
  geom_point() +
  geom_smooth() +
  labs(
    x = "Thời gian sau TS (năm)",
    y = "Thời gian làm việc (năm)"
  ) +
  ggtitle("Liên quan giữa thời gian làm việc và thời gian sau TS") +
  theme_bw()
p1
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
Kiểm tra phân bố
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
# Side-by-side density-scaled histograms (with kernel density overlays) of
# years of service and years since PhD.
# after_stat(density) replaces the `..density..` dot-dot notation, which was
# deprecated in ggplot2 3.4.0. bins = 30 spells out the stat_bin() default,
# so the binning is unchanged and the "Pick better value" messages are no
# longer emitted.
p <- ggplot(data = salary, aes(x = Yrs.service)) +
  geom_histogram(
    aes(y = after_stat(density)),
    bins = 30,
    color = "white",
    fill = "blue"
  ) +
  geom_density(col = "red")
p1 <- ggplot(data = salary, aes(x = Yrs.since.phd)) +
  geom_histogram(
    aes(y = after_stat(density)),
    bins = 30,
    color = "white",
    fill = "blue"
  ) +
  geom_density(col = "red")
grid.arrange(p, p1, ncol = 2)
Đánh giá mối liên quan giữa thời gian sau TS và thời gian làm việc
# Pearson product-moment correlation between years of service and years
# since PhD, with a test of zero correlation and a 95% confidence interval.
cor.test(salary$Yrs.service, salary$Yrs.since.phd, method= "pearson")
##
## Pearson's product-moment correlation
##
## data: salary$Yrs.service and salary$Yrs.since.phd
## t = 43.524, df = 395, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.8909977 0.9252353
## sample estimates:
## cor
## 0.9096491
# Same scatter plot, coloured and filled by Sex, with a separate cubic
# polynomial least-squares fit (x, x^2, x^3 terms) drawn for each group.
p <- ggplot(
  data = salary,
  aes(x = Yrs.since.phd, y = Yrs.service, fill = Sex, col = Sex)
)
p1 <- p +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x + I(x^2) + I(x^3)) +
  labs(
    x = "Thời gian sau TS (năm)",
    y = "Thời gian làm việc (năm)"
  ) +
  ggtitle("Liên quan giữa thời gian làm việc và thời gian sau TS theo giới tính") +
  theme_bw()
p1