Importing the Dataset

# Load the data set. `stringsAsFactors = TRUE` is spelled out because
# the categorical columns are later summarised, passed to levels() and recoded
# with as.numeric(), all of which need factors; since R 4.0.0 read.csv()
# returns character columns by default.
df <- read.csv("dataset.csv", header = TRUE, stringsAsFactors = TRUE)
# NOTE: later statements refer to columns by bare name (e.g. `Charges`,
# `Smoking`), which relies on this attach(). The attached columns are a
# snapshot: later changes to `df` are not visible through the bare names.
attach(df)
names(df)
## [1] "Charges"  "Age"      "Gender"   "BMI"      "children" "Smoking" 
## [7] "Region"
# Preview the first rows of the data frame.
head(df)
##   Charges Age Gender    BMI children Smoking    Region
## 1   16885  19 female 27.900        0     yes southwest
## 2    1726  18   male 33.770        1      no southeast
## 3    4449  28   male 33.000        3      no southeast
## 4   21984  33   male 22.705        0      no northwest
## 5    3867  32   male 28.880        0      no northwest
## 6    3757  31 female 25.740        0      no southeast
# Per-column summary of the data frame.
summary(df)
##     Charges           Age           Gender         BMI       
##  Min.   : 1122   Min.   :18.00   female:662   Min.   :15.96  
##  1st Qu.: 4740   1st Qu.:27.00   male  :676   1st Qu.:26.30  
##  Median : 9382   Median :39.00                Median :30.40  
##  Mean   :13270   Mean   :39.21                Mean   :30.66  
##  3rd Qu.:16640   3rd Qu.:51.00                3rd Qu.:34.69  
##  Max.   :63770   Max.   :64.00                Max.   :53.13  
##     children     Smoking          Region   
##  Min.   :0.000   no :1064   northeast:324  
##  1st Qu.:0.000   yes: 274   northwest:325  
##  Median :1.000              southeast:364  
##  Mean   :1.095              southwest:325  
##  3rd Qu.:2.000                             
##  Max.   :5.000
# Histogram of the (attached) Charges column using 30 breaks.
hist(Charges, breaks = 30, col = "blue", ylab = "Frequency")

library(tidyverse)
## ── Attaching packages ── tidyverse 1.2.1 ──
## ✔ ggplot2 3.1.0       ✔ purrr   0.3.0  
## ✔ tibble  2.0.1       ✔ dplyr   0.8.0.1
## ✔ tidyr   0.8.2       ✔ stringr 1.3.1  
## ✔ readr   1.3.1       ✔ forcats 0.3.0
## ── Conflicts ───── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
# Histogram of Charges with bars outlined by Smoking group and placed side by
# side; the legend is moved above the plot.
ggplot(data = df, mapping = aes(x = Charges, colour = Smoking)) +
  geom_histogram(position = "dodge", fill = "white") +
  theme(legend.position = "top")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Violin plot of Charges by Smoking, filled by Gender, with lines drawn at the
# 25%, 50% and 75% quantiles.
ggplot(df, aes(x = Smoking, y = Charges, fill = Gender)) +
  geom_violin(draw_quantiles = c(0.25, 0.5, 0.75))

# Scatter plot of Charges against Age, coloured by Smoking.
ggplot(df, aes(x = Age, y = Charges, colour = Smoking)) +
  geom_point()

# Histogram of BMI.
ggplot(df, aes(x = BMI)) +
  geom_histogram(position = "dodge", fill = "blue") +
  theme(legend.position = "top")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Scatter plot of Charges against BMI.
ggplot(df, aes(x = BMI, y = Charges)) +
  geom_point()

# Bar chart counting the rows for each value of `children`.
bg <- ggplot(df, aes(x = children))
bg + geom_bar(fill = "blue")

# Boxplots of Charges, one box per value of `children`.
p <- ggplot(df, aes(x = children, y = Charges, group = children))
p + geom_boxplot(colour = "blue", fill = "white")

# Recode the categorical columns as numeric level codes so that cor() can be
# applied to the whole data frame. Going through as.factor() first keeps this
# working when the columns are character vectors (the read.csv() default since
# R 4.0.0), where as.numeric() alone would produce NA with a warning. For
# columns that are already factors the result is unchanged.
df$Gender <- as.numeric(as.factor(df$Gender))
df$Smoking <- as.numeric(as.factor(df$Smoking))
df$Region <- as.numeric(as.factor(df$Region))
library(corrplot)
## corrplot 0.84 loaded
# Correlation matrix of all columns of `df`, drawn with the "pie" method.
costCorr <- cor(df)
corrplot(costCorr, method = "pie")

Before we perform an independent two-sample t-test to check the difference in mean charges between smokers and non-smokers, we look at a boxplot of charges for the two groups. The boxplot shows that smokers and non-smokers have unequal variances, so in the t-test we will set the variance parameter to unequal variances.

# Boxplots of Charges for each Smoking group (columns come from attach()).
boxplot(Charges ~ Smoking)

Let’s perform the t-test. 1. The null hypothesis is that the mean charges of smokers and non-smokers are equal. 2. The alternative hypothesis is that the mean charges of smokers and non-smokers are not equal. 3. We perform a two-sided test because the alternative hypothesis is “not equal” and does not specify a direction. 4. We use a 95% confidence level.

?t.test
# Two-sided two-sample t-test of mean Charges between the Smoking groups.
# `var.equal = FALSE` requests the unequal-variance (Welch) test. The argument
# names are written out in full and spelled correctly: a misspelled name is
# silently absorbed by `...` and has no effect.
t.test(Charges ~ Smoking, mu = 0, alternative = "two.sided",
       conf.level = 0.95, var.equal = FALSE)
## 
##  Welch Two Sample t-test
## 
## data:  Charges by Smoking
## t = -32.752, df = 311.85, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -25034.72 -22197.22
## sample estimates:
##  mean in group no mean in group yes 
##          8434.266         32050.234
# Boxplots of Charges for each Gender group.
boxplot(Charges ~ Gender)

# Two-sided two-sample t-test of mean Charges between the Gender groups.
# `var.equal = FALSE` requests the unequal-variance (Welch) test. The argument
# names are written out in full and spelled correctly: a misspelled name is
# silently absorbed by `...` and has no effect.
t.test(Charges ~ Gender, mu = 0, alternative = "two.sided",
       conf.level = 0.95, var.equal = FALSE)
## 
##  Welch Two Sample t-test
## 
## data:  Charges by Gender
## t = -2.1009, df = 1313.4, p-value = 0.03584
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -2682.46911   -91.83495
## sample estimates:
## mean in group female   mean in group male 
##             12569.59             13956.74

One-way ANOVA with Region as the independent variable.

# Reload the raw data before inspecting Region: the categorical columns of
# `df` were recoded to numeric codes for the correlation plot, so converting
# that column to a factor would give levels "1".."4" rather than region names.
df <- read.csv("dataset.csv", header = TRUE, stringsAsFactors = TRUE)
df$Region <- as.factor(df$Region)
levels(df$Region)
## [1] "northeast" "northwest" "southeast" "southwest"
# Boxplots of Charges for each Region, filled by Region.
reg <- ggplot(data = df, mapping = aes(x = Region, y = Charges, fill = Region))
reg + geom_boxplot()

One-way ANOVA for the Region variable

# Fit the one-way ANOVA of Charges on Region and print its table.
oneWay <- aov(Charges ~ Region)
summary(oneWay)
##               Df    Sum Sq   Mean Sq F value Pr(>F)  
## Region         3 1.301e+09 433589497    2.97 0.0309 *
## Residuals   1334 1.948e+11 146007098                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Multiple Linear Regression: First Model. In the first model, we use three independent variables: Age, BMI, and Smoking.

# First model: Charges regressed on Age, BMI and Smoking.
fit <- lm(Charges ~ Age + BMI + Smoking)
summary(fit)
## 
## Call:
## lm(formula = Charges ~ Age + BMI + Smoking)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -12415.1  -2971.2   -980.1   1480.3  28971.9 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -11676.84     937.57  -12.45   <2e-16 ***
## Age            259.55      11.93   21.75   <2e-16 ***
## BMI            322.61      27.49   11.74   <2e-16 ***
## Smokingyes   23823.69     412.87   57.70   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6092 on 1334 degrees of freedom
## Multiple R-squared:  0.7475, Adjusted R-squared:  0.7469 
## F-statistic:  1316 on 3 and 1334 DF,  p-value: < 2.2e-16
# Diagnostic plots for the fitted model `fit`.
plot(fit)

# Variance inflation factors for the predictors of `fit` (computed once).
car::vif(fit)
##      Age      BMI  Smoking 
## 1.012747 1.012128 1.000669
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
# Studentized residuals of `fit` and a histogram of their distribution.
sresid <- studres(fit)
hist(sresid, breaks = 40, xlab = "Studentized Residuals",
     main = "Distribution of Studentized Residuals")

# Second model: the first model's predictors plus `children`. `fit` is the
# first model fitted earlier with the same formula, so it is reused here
# rather than refitted. anova() compares the two nested models.
fit2 <- lm(Charges ~ Age + BMI + Smoking + children)
anova(fit, fit2)
## Analysis of Variance Table
## 
## Model 1: Charges ~ Age + BMI + Smoking
## Model 2: Charges ~ Age + BMI + Smoking + children
##   Res.Df        RSS Df Sum of Sq      F    Pr(>F)    
## 1   1334 4.9513e+10                                  
## 2   1333 4.9078e+10  1 434747446 11.808 0.0006079 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# `fit2` was already fitted for the anova() comparison; summarise it directly
# instead of refitting the identical model.
summary(fit2)
## 
## Call:
## lm(formula = Charges ~ Age + BMI + Smoking + children)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -11897.7  -2920.9   -986.4   1392.1  29509.7 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -12102.76     941.98 -12.848  < 2e-16 ***
## Age            257.85      11.90  21.675  < 2e-16 ***
## BMI            321.85      27.38  11.756  < 2e-16 ***
## Smokingyes   23811.40     411.22  57.904  < 2e-16 ***
## children       473.49     137.79   3.436 0.000608 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6068 on 1333 degrees of freedom
## Multiple R-squared:  0.7497, Adjusted R-squared:  0.7489 
## F-statistic: 998.1 on 4 and 1333 DF,  p-value: < 2.2e-16