This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button, a document will be generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
library(readxl)
## Warning: package 'readxl' was built under R version 4.3.3
# Resolve the data directory from the PROJECT_DATA_DIR environment variable so
# the document can be knitted on machines other than the author's. When the
# variable is not set, fall back to the original absolute location.
data_dir <- Sys.getenv(
  "PROJECT_DATA_DIR",
  unset = "C:/GGTUAN/DREAMS/Yankee/TSU/MSc_TSU/Spring_2024/CS-583 Data Minning/Project_Data"
)
# Load the "Hoja" worksheet of the age/height workbook.
ageandheight <- read_excel(file.path(data_dir, "AgeHeight.xlsx"), sheet = "Hoja")
# Display the loaded table.
ageandheight
## # A tibble: 12 × 3
## age height no_siblings
## <dbl> <dbl> <dbl>
## 1 18 76.1 1
## 2 19 77 2
## 3 20 78.1 3
## 4 21 78.2 2
## 5 22 78.8 0
## 6 23 79.7 1
## 7 24 79.9 5
## 8 25 81.1 0
## 9 26 81.2 1
## 10 27 81.8 4
## 11 28 82.8 1
## 12 29 83.5 5
# Simple linear regression: height explained by age alone.
lmheight <- lm(formula = height ~ age, data = ageandheight)
# Show the coefficient table, residual quantiles and fit statistics.
summary(object = lmheight)
##
## Call:
## lm(formula = height ~ age, data = ageandheight)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.27238 -0.24248 -0.02762 0.16014 0.47238
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 64.9283 0.5084 127.71 < 2e-16 ***
## age 0.6350 0.0214 29.66 4.43e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.256 on 10 degrees of freedom
## Multiple R-squared: 0.9888, Adjusted R-squared: 0.9876
## F-statistic: 880 on 1 and 10 DF, p-value: 4.428e-11
# Multiple linear regression: height explained by age and number of siblings.
lmheight2 <- lm(
  formula = height ~ age + no_siblings,
  data = ageandheight
)
# Show the coefficient table, residual quantiles and fit statistics.
summary(object = lmheight2)
##
## Call:
## lm(formula = height ~ age + no_siblings, data = ageandheight)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.26297 -0.22462 -0.02021 0.16102 0.49752
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 64.90554 0.53526 121.260 8.96e-16 ***
## age 0.63751 0.02340 27.249 5.85e-10 ***
## no_siblings -0.01772 0.04735 -0.374 0.717
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2677 on 9 degrees of freedom
## Multiple R-squared: 0.9889, Adjusted R-squared: 0.9865
## F-statistic: 402.2 on 2 and 9 DF, p-value: 1.576e-09
library(readr)
# Resolve the data directory from the PROJECT_DATA_DIR environment variable so
# the document can be knitted on machines other than the author's. When the
# variable is not set, fall back to the original absolute location.
data_dir <- Sys.getenv(
  "PROJECT_DATA_DIR",
  unset = "C:/GGTUAN/DREAMS/Yankee/TSU/MSc_TSU/Spring_2024/CS-583 Data Minning/Project_Data"
)
# Load the insurance data with readr; column types are guessed by read_csv().
insurance <- read_csv(file.path(data_dir, "insurance.csv"))
## Rows: 1338 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): sex, smoker, region
## dbl (4): age, bmi, children, charges
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# View() opens a data viewer and is only useful in an interactive session;
# when the document is knitted non-interactively it may fail or have no
# effect, so only call it interactively.
if (interactive()) {
  View(insurance)
}
# Resolve the data directory from the PROJECT_DATA_DIR environment variable so
# the document can be knitted on machines other than the author's. When the
# variable is not set, fall back to the original absolute location.
data_dir <- Sys.getenv(
  "PROJECT_DATA_DIR",
  unset = "C:/GGTUAN/DREAMS/Yankee/TSU/MSc_TSU/Spring_2024/CS-583 Data Minning/Project_Data"
)
# Load the same CSV with base R, converting character columns to factors so
# that lm() and summary() treat them as categorical variables.
insurance2 <- read.csv(
  file.path(data_dir, "insurance.csv"),
  stringsAsFactors = TRUE
)
if (interactive()) {
  View(insurance2)
}
# Inspect the structure of the table loaded with read_csv().
str(object = insurance)
## spc_tbl_ [1,338 × 7] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ age : num [1:1338] 19 18 28 33 32 31 46 37 37 60 ...
## $ sex : chr [1:1338] "female" "male" "male" "male" ...
## $ bmi : num [1:1338] 27.9 33.8 33 22.7 28.9 ...
## $ children: num [1:1338] 0 1 3 0 0 0 1 3 2 0 ...
## $ smoker : chr [1:1338] "yes" "no" "no" "no" ...
## $ region : chr [1:1338] "southwest" "southeast" "southeast" "northwest" ...
## $ charges : num [1:1338] 16885 1726 4449 21984 3867 ...
## - attr(*, "spec")=
## .. cols(
## .. age = col_double(),
## .. sex = col_character(),
## .. bmi = col_double(),
## .. children = col_double(),
## .. smoker = col_character(),
## .. region = col_character(),
## .. charges = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
# Inspect the structure of the table loaded with read.csv().
str(object = insurance2)
## 'data.frame': 1338 obs. of 7 variables:
## $ age : int 19 18 28 33 32 31 46 37 37 60 ...
## $ sex : Factor w/ 2 levels "female","male": 1 2 2 2 2 1 1 1 2 1 ...
## $ bmi : num 27.9 33.8 33 22.7 28.9 ...
## $ children: int 0 1 3 0 0 0 1 3 2 0 ...
## $ smoker : Factor w/ 2 levels "no","yes": 2 1 1 1 1 1 1 1 1 1 ...
## $ region : Factor w/ 4 levels "northeast","northwest",..: 4 3 3 2 2 3 3 2 1 2 ...
## $ charges : num 16885 1726 4449 21984 3867 ...
# Per-column summary: quantiles for numeric columns, counts for factors.
summary(object = insurance2)
## age sex bmi children smoker
## Min. :18.00 female:662 Min. :15.96 Min. :0.000 no :1064
## 1st Qu.:27.00 male :676 1st Qu.:26.30 1st Qu.:0.000 yes: 274
## Median :39.00 Median :30.40 Median :1.000
## Mean :39.21 Mean :30.66 Mean :1.095
## 3rd Qu.:51.00 3rd Qu.:34.69 3rd Qu.:2.000
## Max. :64.00 Max. :53.13 Max. :5.000
## region charges
## northeast:324 Min. : 1122
## northwest:325 1st Qu.: 4740
## southeast:364 Median : 9382
## southwest:325 Mean :13270
## 3rd Qu.:16640
## Max. :63770
# Histogram of the charges column. The argument expression is kept as
# `insurance2$charges` because hist() derives the title and axis label from it.
hist(x = insurance2$charges)
# Number of rows in each region.
table(insurance2[["region"]])
##
## northeast northwest southeast southwest
## 324 325 364 325
# Share of rows in each region.
prop.table(x = table(insurance2[["region"]]))
##
## northeast northwest southeast southwest
## 0.2421525 0.2428999 0.2720478 0.2428999
# Number of rows for each sex.
table(insurance2[["sex"]])
##
## female male
## 662 676
# Share of rows for each sex.
prop.table(x = table(insurance2[["sex"]]))
##
## female male
## 0.4947683 0.5052317
# Number of smokers and non-smokers.
table(insurance2[["smoker"]])
##
## no yes
## 1064 274
# Share of smokers and non-smokers.
prop.table(x = table(insurance2[["smoker"]]))
##
## no yes
## 0.7952167 0.2047833
# Pairwise correlation matrix of the four numeric columns.
cor(x = insurance2[, c("age", "bmi", "children", "charges")])
## age bmi children charges
## age 1.0000000 0.1092719 0.04246900 0.29900819
## bmi 0.1092719 1.0000000 0.01275890 0.19834097
## children 0.0424690 0.0127589 1.00000000 0.06799823
## charges 0.2990082 0.1983410 0.06799823 1.00000000
#insurance2.cor = cor(insurance2)
#corrplot(cor(insurance2[c('age', 'bmi', 'children', 'charges')]))
# Base-graphics scatterplot matrix of the four numeric columns.
pairs(x = insurance2[, c("age", "bmi", "children", "charges")])
library(psych)
# Scatterplot matrix of the same columns drawn with psych::pairs.panels().
pairs.panels(x = insurance2[, c("age", "bmi", "children", "charges")])
# Regress charges on every predictor, with the predictors spelled out.
ins_model <- lm(
  formula = charges ~ age + children + bmi + sex + smoker + region,
  data = insurance2
)
# Same set of predictors, written with the `.` shorthand for "all other
# columns of the data frame".
ins_model2 <- lm(formula = charges ~ ., data = insurance2)
# Print the call and the estimated coefficients.
print(ins_model2)
##
## Call:
## lm(formula = charges ~ ., data = insurance2)
##
## Coefficients:
## (Intercept) age sexmale bmi
## -11938.5 256.9 -131.3 339.2
## children smokeryes regionnorthwest regionsoutheast
## 475.5 23848.5 -353.0 -1035.0
## regionsouthwest
## -960.1
# Coefficient table, residual quantiles and fit statistics of the
# explicitly specified insurance model.
summary(object = ins_model)
##
## Call:
## lm(formula = charges ~ age + children + bmi + sex + smoker +
## region, data = insurance2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11304.9 -2848.1 -982.1 1393.9 29992.8
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -11938.5 987.8 -12.086 < 2e-16 ***
## age 256.9 11.9 21.587 < 2e-16 ***
## children 475.5 137.8 3.451 0.000577 ***
## bmi 339.2 28.6 11.860 < 2e-16 ***
## sexmale -131.3 332.9 -0.394 0.693348
## smokeryes 23848.5 413.1 57.723 < 2e-16 ***
## regionnorthwest -353.0 476.3 -0.741 0.458769
## regionsoutheast -1035.0 478.7 -2.162 0.030782 *
## regionsouthwest -960.0 477.9 -2.009 0.044765 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6062 on 1329 degrees of freedom
## Multiple R-squared: 0.7509, Adjusted R-squared: 0.7494
## F-statistic: 500.8 on 8 and 1329 DF, p-value: < 2.2e-16