library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(purrr)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ readr 2.1.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggthemes)
library(ggrepel)
# Absolute location of the smoking/drinking CSV file used by this analysis.
data_path <- "C:/Users/shanata/Downloads/smoking_driking_dataset_Ver01.csv"
# Read the whole file into a data frame with base R's CSV reader.
data <- read.csv(file = data_path)
# Box plots of hemoglobin, one box per value of `sex`.
ggplot(data, aes(x = sex, y = hemoglobin)) +
  geom_boxplot() +
  labs(y = "Hemoglobin levels", x = "Gender") +
  theme_minimal()
#### Null hypothesis:
Categorical variable: Gender. Continuous variable: Hemoglobin levels.
H0: The average hemoglobin level is the same for both genders.
# One-way ANOVA of hemoglobin on `sex`.
m <- aov(formula = hemoglobin ~ sex, data = data)
# Print the ANOVA table for the fit.
summary(m)
## Df Sum Sq Mean Sq F value Pr(>F)
## sex 1 1115932 1115932 804956 <2e-16 ***
## Residuals 991344 1374326 1
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
With such a small p-value, we can conclude that there is a statistically significant difference in mean hemoglobin levels between the two genders. In other words, the average hemoglobin levels are not the same for both genders.
Independent variable: weight. Dependent variable: waistline.
Trying to predict waistline based on weight.
# Scatter plot of waistline (y) against weight (x).
ggplot(data) +
  geom_point(aes(x = weight, y = waistline), colour = "darkblue", size = 2) +
  theme_minimal()
We can see that they have a linear relationship
# Simple linear regression of waistline on weight.
model <- lm(formula = waistline ~ weight, data = data)
# Show the fitted intercept and slope.
coef(model)
## (Intercept) weight
## 43.0497113 0.6033692
Null Hypothesis (H0): There is no linear relationship between weight and waistline (the slope for weight is zero). Alternative Hypothesis (H1): There is a linear relationship between weight and waistline (the slope for weight is not zero).
# Compare the p-value of the weight slope with the 5% significance level and
# report the outcome of the test.
alpha <- 0.05
summary_model <- summary(model)
# Row "weight", column "Pr(>|t|)" of the coefficient table.
p_value_weight <- coef(summary_model)["weight", "Pr(>|t|)"]
if (p_value_weight >= alpha) {
  cat("Fail to reject the null hypothesis: There is no significant relationship between weight and waistline.\n")
} else {
  cat("Reject the null hypothesis: There is a significant relationship between weight and waistline.\n")
}
## Reject the null hypothesis: There is a significant relationship between weight and waistline.
Therefore, we can conclude that there is a significant relationship between weight and waistline.
# Draw diagnostic plots 1 to 4 of the fitted model on a 2 x 2 grid, then
# switch the device back to a single-panel layout.
par(mfrow = c(2, 2))
# invisible() keeps the list returned by lapply() from being printed.
invisible(lapply(1:4, function(panel) plot(model, which = panel)))
par(mfrow = c(1, 1))
I want to find out how gender affects the waistline and weight relationship
# Mean waistline for each value of `sex`; drawn below as a dashed reference
# line in the matching facet.
data_grouped <- summarise(
  group_by(data, sex),
  mean_waistline = mean(waistline)
)

# Waistline against weight, one panel per value of `sex`.
ggplot(data, aes(x = weight, y = waistline)) +
  geom_point() +
  geom_hline(
    data = data_grouped,
    aes(yintercept = mean_waistline),
    linetype = "dashed",
    colour = "darkorange"
  ) +
  facet_wrap(vars(sex), labeller = label_both) +
  labs(title = "Waistline wrt weight", x = "weight", y = "waistline") +
  theme_minimal()
# Refit the regression with `sex` as a second predictor next to weight
# (this overwrites the earlier `model`).
model <- lm(formula = waistline ~ weight + sex, data = data)
# Show the fitted coefficients.
coef(model)
## (Intercept) weight sexMale
## 42.1779396 0.6249516 -0.9303926
Holding sex constant, each additional kilogram of weight is associated with a 0.625 centimeter increase in waistline.
When comparing males (sexMale = 1) to females (sexMale = 0) with all other factors held constant, males tend to have a waistline that is 0.930 centimeters smaller than females.
Next, we add smoking status and drinking status as predictors.
# Refit the regression with smoking status and drinking status added to
# weight and sex (this overwrites the earlier `model`).
# NOTE(review): SMK_stat_type_cd enters the formula as-is; if it is a category
# code rather than a quantity, wrapping it in factor() may be intended - confirm.
model <- lm(
  formula = waistline ~ weight + sex + SMK_stat_type_cd + DRK_YN,
  data = data
)
# Show the fitted coefficients.
coef(model)
## (Intercept) weight sexMale SMK_stat_type_cd
## 42.4950483 0.6306037 -0.2701830 -0.0695271
## DRK_YNY
## -1.8278168
Weight (0.631): For each additional kilogram in weight, waistline is expected to increase by approximately 0.631 centimeters.
SexMale (-0.270): On average, males tend to have a waistline approximately 0.270 centimeters smaller than females, controlling for other variables.
SMK_stat_type_cd (-0.070): The model treats the smoking status code as a numeric variable, so each one-unit increase in the code is associated with a 0.070 centimeter decrease in waistline, on average, when other factors are held constant.
DRK_YNY (-1.828): Individuals with DRK_YN = Y (drinkers) tend to have a waistline approximately 1.828 centimeters smaller than individuals in the reference (non-drinking) group, on average, controlling for all other predictors.