R Markdown
Statistical analysis.
# Load the dataset from the CSV file and preview its first six rows.
data <- read.csv(file = "stadataset.csv")
head(data, n = 6L)
## age sex bmi children smoker region charges
## 1 19 female 27.900 0 yes southwest 16884.924
## 2 18 male 33.770 1 no southeast 1725.552
## 3 28 male 33.000 3 no southeast 4449.462
## 4 33 male 22.705 0 no northwest 21984.471
## 5 32 male 28.880 0 no northwest 3866.855
## 6 31 female 25.740 0 no southeast 3756.622
# List the column names of the data frame.
names(data)
## [1] "age" "sex" "bmi" "children" "smoker" "region" "charges"
# Print per-column summary statistics (quartiles and mean for numeric
# columns; length, class and mode for character columns).
summary(data)
## age sex bmi children
## Min. :18.00 Length:1338 Min. :15.96 Min. :0.000
## 1st Qu.:27.00 Class :character 1st Qu.:26.30 1st Qu.:0.000
## Median :39.00 Mode :character Median :30.40 Median :1.000
## Mean :39.21 Mean :30.66 Mean :1.095
## 3rd Qu.:51.00 3rd Qu.:34.69 3rd Qu.:2.000
## Max. :64.00 Max. :53.13 Max. :5.000
## smoker region charges
## Length:1338 Length:1338 Min. : 1122
## Class :character Class :character 1st Qu.: 4740
## Mode :character Mode :character Median : 9382
## Mean :13270
## 3rd Qu.:16640
## Max. :63770
# Show the structure of the data frame: its dimensions and each column's type.
str(data)
## 'data.frame': 1338 obs. of 7 variables:
## $ age : int 19 18 28 33 32 31 46 37 37 60 ...
## $ sex : chr "female" "male" "male" "male" ...
## $ bmi : num 27.9 33.8 33 22.7 28.9 ...
## $ children: int 0 1 3 0 0 0 1 3 2 0 ...
## $ smoker : chr "yes" "no" "no" "no" ...
## $ region : chr "southwest" "southeast" "southeast" "northwest" ...
## $ charges : num 16885 1726 4449 21984 3867 ...
# Store and print the length of the data frame.
# NOTE(review): length() on a data frame counts its columns (7 here), not
# its rows; use nrow(data) if the number of observations was intended.
n <- length(data)
print(n)
## [1] 7
one sample test
# Descriptive statistics for BMI: mean, then standard deviation.
mean(data$bmi)
## [1] 30.6634
sd(data$bmi)
## [1] 6.098187
# Histogram of BMI on the density scale (total bar area is 1) rather than
# raw counts.
hist(data$bmi, probability = TRUE)

# Variance of BMI.
var(data$bmi)
## [1] 37.18788
# One-sample t-test: draw 300 BMI values at random (without replacement)
# and test whether their mean differs from the hypothesised value 49.
x <- sample(data$bmi, size = 300)
t.test(x, mu = 49)
##
## One Sample t-test
##
## data: x
## t = -53.14, df = 299, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 49
## 95 percent confidence interval:
## 29.68873 31.06797
## sample estimates:
## mean of x
## 30.37835
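From the above result it is observed that the p-value (< 2.2e-16) is far
below the 0.05 significance level, so we reject the null hypothesis that the
mean BMI equals 49.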
two sample test
# Two-sample t-test (pooled variance) on two independent random draws of
# 100 charges each; the null hypothesis is a difference in means of 0.5.
x <- sample(data$charges, size = 100)
y <- sample(data$charges, size = 100)
t.test(x, y, mu = 0.5, var.equal = TRUE)
##
## Two Sample t-test
##
## data: x and y
## t = 0.93439, df = 198, p-value = 0.3512
## alternative hypothesis: true difference in means is not equal to 0.5
## 95 percent confidence interval:
## -1658.611 4647.667
## sample estimates:
## mean of x mean of y
## 12920.09 11425.56
paired test
# Paired t-test of age against BMI on 100 randomly chosen subjects.
# A paired test requires both measurements to come from the same subject,
# so draw ONE set of row indices and read age and bmi from those same rows.
# Sampling the two columns independently would pair values from unrelated
# people and make the pairing meaningless.
# NOTE(review): age and BMI are measured in different units, so confirm
# that a mean-difference test between them is the intended analysis.
idx <- sample(nrow(data), 100)
x <- data$age[idx]
y <- data$bmi[idx]
t.test(x, y, paired = TRUE)
##
## Paired t-test
##
## data: x and y
## t = 6.191, df = 99, p-value = 1.363e-08
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
## 6.483081 12.598819
## sample estimates:
## mean difference
## 9.54095
correlation test
# Pearson correlation test between age and BMI over the full dataset.
cor.test(data$age, data$bmi, method = "pearson")
##
## Pearson's product-moment correlation
##
## data: data$age and data$bmi
## t = 4.0181, df = 1336, p-value = 6.194e-05
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.05600895 0.16191463
## sample estimates:
## cor
## 0.1092719