R Markdown

Statistical analysis.

# Load the dataset from the working directory and preview the first six rows.
data <- read.csv(file = "stadataset.csv")
head(data, n = 6L)
##   age    sex    bmi children smoker    region   charges
## 1  19 female 27.900        0    yes southwest 16884.924
## 2  18   male 33.770        1     no southeast  1725.552
## 3  28   male 33.000        3     no southeast  4449.462
## 4  33   male 22.705        0     no northwest 21984.471
## 5  32   male 28.880        0     no northwest  3866.855
## 6  31 female 25.740        0     no southeast  3756.622
# List the column names of the loaded data frame.
names(data)
## [1] "age"      "sex"      "bmi"      "children" "smoker"   "region"   "charges"
# Per-column summary: quartiles and mean for numeric columns; character
# columns (sex, smoker, region) only report length, class and mode.
summary(data)
##       age            sex                 bmi           children    
##  Min.   :18.00   Length:1338        Min.   :15.96   Min.   :0.000  
##  1st Qu.:27.00   Class :character   1st Qu.:26.30   1st Qu.:0.000  
##  Median :39.00   Mode  :character   Median :30.40   Median :1.000  
##  Mean   :39.21                      Mean   :30.66   Mean   :1.095  
##  3rd Qu.:51.00                      3rd Qu.:34.69   3rd Qu.:2.000  
##  Max.   :64.00                      Max.   :53.13   Max.   :5.000  
##     smoker             region             charges     
##  Length:1338        Length:1338        Min.   : 1122  
##  Class :character   Class :character   1st Qu.: 4740  
##  Mode  :character   Mode  :character   Median : 9382  
##                                        Mean   :13270  
##                                        3rd Qu.:16640  
##                                        Max.   :63770
# Show the structure of the data frame: dimensions, column types and the
# first few values of each column.
str(data)
## 'data.frame':    1338 obs. of  7 variables:
##  $ age     : int  19 18 28 33 32 31 46 37 37 60 ...
##  $ sex     : chr  "female" "male" "male" "male" ...
##  $ bmi     : num  27.9 33.8 33 22.7 28.9 ...
##  $ children: int  0 1 3 0 0 0 1 3 2 0 ...
##  $ smoker  : chr  "yes" "no" "no" "no" ...
##  $ region  : chr  "southwest" "southeast" "southeast" "northwest" ...
##  $ charges : num  16885 1726 4449 21984 3867 ...
# Number of observations. length() on a data frame counts its columns (7 here),
# so nrow() is used to get the row count reported by str() above.
n <- nrow(data)
n
## [1] 1338

one sample test

# Descriptive statistics for BMI: sample mean.
mean(data$bmi)
## [1] 30.6634
# Sample standard deviation of BMI.
sd(data$bmi)
## [1] 6.098187
# Histogram of BMI; prob=TRUE is passed so the plot shows densities, not counts.
hist(data$bmi,prob=TRUE)

# Sample variance of BMI (the square of the sd above).
var(data$bmi)
## [1] 37.18788
# Draw 300 BMI values without replacement and test H0: mean BMI = 49
# (two-sided, the t.test default).
# NOTE(review): no set.seed() call is visible, so the sample — and the numbers
# printed below — will differ on every run.
x <- sample(data$bmi, size = 300)
t.test(x, mu = 49)
## 
##  One Sample t-test
## 
## data:  x
## t = -53.14, df = 299, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 49
## 95 percent confidence interval:
##  29.68873 31.06797
## sample estimates:
## mean of x 
##  30.37835

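From the above result, the p-value (< 2.2e-16) is far smaller than the 0.05 significance level, so we reject the null hypothesis that the mean BMI equals 49; the 95% confidence interval (29.69, 31.07) also excludes 49.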

two sample test

# Two independent draws of 100 charges each, compared with a pooled-variance
# (var.equal = TRUE) two-sample t-test of H0: mean(x) - mean(y) = 0.5.
# NOTE(review): both samples come from the same column, so this does not
# compare two distinct groups; confirm the intended grouping and the 0.5 offset.
x <- sample(data$charges, size = 100)
y <- sample(data$charges, size = 100)
t.test(x, y, mu = 0.5, var.equal = TRUE)
## 
##  Two Sample t-test
## 
## data:  x and y
## t = 0.93439, df = 198, p-value = 0.3512
## alternative hypothesis: true difference in means is not equal to 0.5
## 95 percent confidence interval:
##  -1658.611  4647.667
## sample estimates:
## mean of x mean of y 
##  12920.09  11425.56

Paired t-test

# Paired t-test on 100 sampled ages against 100 sampled BMI values.
# NOTE(review): x and y are drawn by separate sample() calls, so x[i] and y[i]
# need not belong to the same row; paired = TRUE assumes they are matched.
x <- sample(data$age, size = 100)
y <- sample(data$bmi, size = 100)
t.test(x, y, paired = TRUE)
## 
##  Paired t-test
## 
## data:  x and y
## t = 6.191, df = 99, p-value = 1.363e-08
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
##   6.483081 12.598819
## sample estimates:
## mean difference 
##         9.54095

correlation test

# Test for correlation between age and BMI using all rows
# (Pearson's product-moment correlation, per the output below).
cor.test(data$age,data$bmi)
## 
##  Pearson's product-moment correlation
## 
## data:  data$age and data$bmi
## t = 4.0181, df = 1336, p-value = 6.194e-05
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.05600895 0.16191463
## sample estimates:
##       cor 
## 0.1092719