Data set for Homework 1, was found on Kaggle. Link to data set is https://www.kaggle.com/datasets/harshsingh2209/medical-insurance-payout.
Research question: Does average expenditure for customer differ between smokers and nonsmokers ?
mydata <- read.table("./expenses.csv", header=TRUE, sep=",", dec=".")
head(mydata)
## age sex bmi children smoker region charges
## 1 19 female 27.900 0 yes southwest 16884.924
## 2 18 male 33.770 1 no southeast 1725.552
## 3 28 male 33.000 3 no southeast 4449.462
## 4 33 male 22.705 0 no northwest 21984.471
## 5 32 male 28.880 0 no northwest 3866.855
## 6 31 female 25.740 0 no southeast 3756.622
Explenation of data set
Unit of observation: customer of the insurance company
Sample size: 1338 units
mydata$sexF <- factor(mydata$sex,
levels = c("male", "female"),
labels = c("male", "female"))
mydata$smokerF <- factor(mydata$smoker,
levels = c("yes", "no"),
labels = c("yes", "no"))
head(mydata, 3)
## age sex bmi children smoker region charges sexF smokerF
## 1 19 female 27.90 0 yes southwest 16884.924 female yes
## 2 18 male 33.77 1 no southeast 1725.552 male no
## 3 28 male 33.00 3 no southeast 4449.462 male no
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
set.seed(10)
mydata1 <- mydata %>% # for the purpose of exercise, I will take a random sample of 100 units
sample_n(100)
Data manipulations
mydata2 <- mydata1[,c(-6)]# excluding variable, that I will not use
head(mydata2)
## age sex bmi children smoker charges sexF smokerF
## 1 19 female 32.900 0 no 1748.774 female no
## 2 42 female 24.985 2 no 8017.061 female no
## 3 52 female 46.750 5 no 12592.534 female no
## 4 63 male 36.765 0 no 13981.850 male no
## 5 58 male 25.175 0 no 11931.125 male no
## 6 34 male 25.300 2 yes 18972.495 male yes
library(psych)
describe(mydata2 [,c(-2,-5,-7,-8)])
## vars n mean sd median trimmed mad min max
## age 1 100 42.04 14.56 42.50 42.40 17.05 18.00 64.0
## bmi 2 100 31.27 6.26 31.40 31.21 6.94 17.86 47.6
## children 3 100 1.22 1.39 1.00 1.00 1.48 0.00 5.0
## charges 4 100 14067.91 12524.07 10673.46 11842.21 7085.90 1141.45 60021.4
## range skew kurtosis se
## age 46.00 -0.17 -1.26 1.46
## bmi 29.74 0.12 -0.35 0.63
## children 5.00 1.08 0.55 0.14
## charges 58879.95 1.56 1.80 1252.41
summary(mydata2)
## age sex bmi children
## Min. :18.00 Length:100 Min. :17.86 Min. :0.00
## 1st Qu.:31.00 Class :character 1st Qu.:26.65 1st Qu.:0.00
## Median :42.50 Mode :character Median :31.40 Median :1.00
## Mean :42.04 Mean :31.27 Mean :1.22
## 3rd Qu.:54.25 3rd Qu.:36.08 3rd Qu.:2.00
## Max. :64.00 Max. :47.60 Max. :5.00
## smoker charges sexF smokerF
## Length:100 Min. : 1141 male :49 yes:19
## Class :character 1st Qu.: 5576 female:51 no :81
## Mode :character Median :10673
## Mean :14068
## 3rd Qu.:14222
## Max. :60021
Statistical hypothesis
# Install and load necessary packages
if (!requireNamespace("ggplot2", quietly = TRUE)) {
install.packages("ggplot2")
}
if (!requireNamespace("ggpubr", quietly = TRUE)) {
install.packages("ggpubr")
}
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
library(ggpubr)
## Warning: package 'ggpubr' was built under R version 4.3.2
library(ggplot2)
smoker <- ggplot(mydata2[mydata2$smokerF=="yes", ], aes(x = charges)) +
theme_linedraw() +
geom_histogram(fill = "green", colour = "black") +
ylab("Frequency") +
ggtitle("Charges of smokers")
nonsmoker <- ggplot(mydata2[mydata2$smokerF=="no", ], aes(x = charges)) +
theme_linedraw() +
geom_histogram(fill = "red", colour = "black") +
ylab("Frequency") +
ggtitle("Charges of nonsmokers")
install.packages("ggpubr")
## Warning: package 'ggpubr' is in use and will not be installed
library(ggpubr)
ggarrange(smoker, nonsmoker,
ncol= 2, nrow = 1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
library(dplyr)
library(rstatix)
## Warning: package 'rstatix' was built under R version 4.3.2
##
## Attaching package: 'rstatix'
## The following object is masked from 'package:stats':
##
## filter
mydata2 %>%
group_by(smokerF) %>%
shapiro_test(charges)
## # A tibble: 2 × 4
## smokerF variable statistic p
## <fct> <chr> <dbl> <dbl>
## 1 yes charges 0.923 0.128
## 2 no charges 0.834 0.0000000420
After plotting the histograms and preforming the Shapiro test, we can conclude that in group nonsmokers variable is not normally distributed. We can reject the H0 at p-value < 0.001.
Assumption of normality is violated but I will however show parametric test for the purpose of the practice.
t.test(mydata2$charges ~ mydata2$smokerF,
paired = FALSE,
var.equal = FALSE,
alternative = "two.sided")
##
## Welch Two Sample t-test
##
## data: mydata2$charges by mydata2$smokerF
## t = 8.2836, df = 20.461, p-value = 5.708e-08
## alternative hypothesis: true difference in means between group yes and group no is not equal to 0
## 95 percent confidence interval:
## 18304.42 30602.34
## sample estimates:
## mean in group yes mean in group no
## 33875.151 9421.768
library(effectsize)
##
## Attaching package: 'effectsize'
## The following objects are masked from 'package:rstatix':
##
## cohens_d, eta_squared
## The following object is masked from 'package:psych':
##
## phi
effectsize::cohens_d(mydata2$charges ~ mydata2$smokerF,
pooled_sd = FALSE)
## Cohen's d | 95% CI
## ------------------------
## 2.45 | [1.49, 3.38]
##
## - Estimated using un-pooled SD.
interpret_cohens_d(2.45, rules = "sawilowsky2009")
## [1] "huge"
## (Rules: sawilowsky2009)
Because normality assumption is violated I need to do non-parametric Wilcoxson Rank Sum Test.
wilcox.test(mydata2$charges ~ mydata2$smokerF,
paired = FALSE,
correct = FALSE,
exact = FALSE,
alternative = "two.sided")
##
## Wilcoxon rank sum test
##
## data: mydata2$charges by mydata2$smokerF
## W = 1504, p-value = 1.092e-10
## alternative hypothesis: true location shift is not equal to 0
We can reject the null hypothesis at p < 0.001.
library(effectsize)
effectsize(wilcox.test(mydata2$charges ~ mydata2$smokerF,
paired = FALSE,
correct = FALSE,
exact = FALSE,
alternative = "two.sided"))
## r (rank biserial) | 95% CI
## --------------------------------
## 0.95 | [0.92, 0.97]
interpret_rank_biserial(0.95)
## [1] "very large"
## (Rules: funder2019)
Conclusion
Based on the sample data, we find that there is difference in average expenditures for smokers and nonsmokers (p< 0.001), the difference in distribution is very large (r= 0.95).