# Load the insurance dataset; text columns are converted to factors.
insurance <- read.csv(file = "insurance.csv", stringsAsFactors = TRUE)
# Display the structure of the loaded data frame.
str(object = insurance)
'data.frame':   1338 obs. of  7 variables:
 $ age     : int  19 18 28 33 32 31 46 37 37 60 ...
 $ sex     : Factor w/ 2 levels "female","male": 1 2 2 2 2 1 1 1 2 1 ...
 $ bmi     : num  27.9 33.8 33 22.7 28.9 25.7 33.4 27.7 29.8 25.8 ...
 $ children: int  0 1 3 0 0 0 1 3 2 0 ...
 $ smoker  : Factor w/ 2 levels "no","yes": 2 1 1 1 1 1 1 1 1 1 ...
 $ region  : Factor w/ 4 levels "northeast","northwest",..: 4 3 3 2 2 3 3 2 1 2 ...
 $ expenses: num  16885 1726 4449 21984 3867 ...
# I did the first steps of activity 7, then used the MLR (multiple linear regression) solution that we built during class

# Summary statistics (min, quartiles, mean, max) of the expenses column
summary(insurance[["expenses"]])
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   1122    4740    9382   13270   16640   63770 
# Histogram of the expenses column, to inspect how its values are distributed
hist(insurance$expenses)


# Number of observations in each region
table(insurance[["region"]])

northeast northwest southeast southwest 
      324       325       364       325 
# Correlation matrix of the numeric columns, to explore how they relate
cor(x = insurance[, c("age", "bmi", "children", "expenses")])
               age        bmi   children   expenses
age      1.0000000 0.10934101 0.04246900 0.29900819
bmi      0.1093410 1.00000000 0.01264471 0.19857626
children 0.0424690 0.01264471 1.00000000 0.06799823
expenses 0.2990082 0.19857626 0.06799823 1.00000000
# Baseline linear model: expenses regressed on every predictor, spelled out.
ins_model <- lm(
  formula = expenses ~ age + children + bmi + sex + smoker + region,
  data = insurance
)
# The same fit written with the `.` shorthand, which stands for all columns of
# `insurance` other than the response.
ins_model <- lm(formula = expenses ~ ., data = insurance)

# Engineered features for the improved model ----
# Squared age, so the model can include a higher-order (non-linear) age term.
insurance$age2 <- insurance$age^2
# Indicator column: 1 when BMI is at least 30, otherwise 0.
insurance$bmi30 <- ifelse(insurance$bmi >= 30, 1, 0)

# Final model: adds the squared-age term and the bmi30-by-smoker interaction.
# In a formula, `bmi30*smoker` expands to bmi30 + smoker + bmi30:smoker.
ins_model2 <- lm(expenses ~ age + age2 + children + bmi + sex +
                   bmi30*smoker + region, data = insurance)

# Case 1: 22-year-old female non-smoker, BMI 24, three children, northwest.
# age2 and bmi30 must be supplied by hand because the model uses them as
# separate columns; bmi30 is 0 here since the BMI is below 30.
case1 <- predict(
  object = ins_model2,
  newdata = data.frame(
    age = 22, age2 = 22^2,
    bmi = 24, bmi30 = 0,
    children = 3, sex = "female",
    smoker = "no", region = "northwest"
  )
)

# Case 2: 22-year-old male smoker, BMI 27, one child, southeast.
# bmi30 is again 0 since the BMI is below 30.
case2 <- predict(
  object = ins_model2,
  newdata = data.frame(
    age = 22, age2 = 22^2,
    bmi = 27, bmi30 = 0,
    children = 1, sex = "male",
    smoker = "yes", region = "southeast"
  )
)

# Print the Case 1 prediction
cat("Case 1 Prediction:", case1, "\n", sep = " ")
Case 1 Prediction: 5858.241 
# Print the Case 2 prediction
cat("Case 2 Prediction:", case2, "\n", sep = " ")
Case 2 Prediction: 17219.31 
LS0tCnRpdGxlOiAiSW4tY2xhc3MgYWN0aXZpdHkgIzgiCm91dHB1dDogaHRtbF9ub3RlYm9vawotLS0KCgpgYGB7cn0KI0ZpcnN0LCBJIGltcG9ydGVkIHRoZSBpbnN1cmFuY2UgZGF0YXNldCBpbiBSCmluc3VyYW5jZSA8LSByZWFkLmNzdigiaW5zdXJhbmNlLmNzdiIsIHN0cmluZ3NBc0ZhY3RvcnMgPSBUUlVFKQpzdHIoaW5zdXJhbmNlKQoKI0kgZGlkIHRoZSBmaXJzdCBzdGVwdHMgb2YgYWN0aXZpdHkgNywgdGhlbiB1c2VkIHRoZSBNTFIgc29sdXRpb24gdGhhdCB3ZSBidWlsdCBkdXJpbmcgY2xhc3MgCgojIHN1bW1hcml6ZSB0aGUgY2hhcmdlcyB2YXJpYWJsZQpzdW1tYXJ5KGluc3VyYW5jZSRleHBlbnNlcykKCiMgaGlzdG9ncmFtIG9mIGluc3VyYW5jZSBjaGFyZ2VzCmhpc3QoaW5zdXJhbmNlJGV4cGVuc2VzKQoKIyB0YWJsZSBvZiByZWdpb24KdGFibGUoaW5zdXJhbmNlJHJlZ2lvbikKCiMgZXhwbG9yaW5nIHJlbGF0aW9uc2hpcHMgYW1vbmcgZmVhdHVyZXM6IGNvcnJlbGF0aW9uIG1hdHJpeApjb3IoaW5zdXJhbmNlW2MoImFnZSIsICJibWkiLCAiY2hpbGRyZW4iLCAiZXhwZW5zZXMiKV0pCgppbnNfbW9kZWwgPC0gbG0oZXhwZW5zZXMgfiBhZ2UgKyBjaGlsZHJlbiArIGJtaSArIHNleCArIHNtb2tlciArIHJlZ2lvbiwKICAgICAgICAgICAgICAgIGRhdGEgPSBpbnN1cmFuY2UpCmluc19tb2RlbCA8LSBsbShleHBlbnNlcyB+IC4sIGRhdGEgPSBpbnN1cmFuY2UpICMgdGhpcyBpcyBlcXVpdmFsZW50IHRvIGFib3ZlCgojIGFkZCBhIGhpZ2hlci1vcmRlciAiYWdlIiB0ZXJtCmluc3VyYW5jZSRhZ2UyIDwtIGluc3VyYW5jZSRhZ2VeMgojIGFkZCBhbiBpbmRpY2F0b3IgZm9yIEJNSSA+PSAzMAppbnN1cmFuY2UkYm1pMzAgPC0gaWZlbHNlKGluc3VyYW5jZSRibWkgPj0gMzAsIDEsIDApCiMgY3JlYXRlIGZpbmFsIG1vZGVsCmluc19tb2RlbDIgPC0gbG0oZXhwZW5zZXMgfiBhZ2UgKyBhZ2UyICsgY2hpbGRyZW4gKyBibWkgKyBzZXggKwogICAgICAgICAgICAgICAgICAgYm1pMzAqc21va2VyICsgcmVnaW9uLCBkYXRhID0gaW5zdXJhbmNlKQoKaW5zX21vZGVsMiA8LSBsbShleHBlbnNlcyB+IGFnZSArIGFnZTIgKyBjaGlsZHJlbiArIGJtaSArIHNleCArCiAgICAgICAgICAgICAgICAgICBibWkzMCpzbW9rZXIgKyByZWdpb24sIGRhdGEgPSBpbnN1cmFuY2UpCgojIFByZWRpY3QgZm9yIENhc2UgMQpjYXNlMSA8LSBwcmVkaWN0KGluc19tb2RlbDIsCiAgICAgICAgZGF0YS5mcmFtZShhZ2UgPSAyMiwgYWdlMiA9IDIyXjIsIGNoaWxkcmVuID0gMywKICAgICAgICAgICAgICAgICAgIGJtaSA9IDI0LCBzZXggPSAiZmVtYWxlIiwgYm1pMzAgPSAwLAogICAgICAgICAgICAgICAgICAgc21va2VyID0gIm5vIiwgcmVnaW9uID0gIm5vcnRod2VzdCIpKQoKIyBQcmVkaWN0IGZvciBDYXNlIDIKY2FzZTIgPC0gcHJlZGljdChpbnNfbW9kZWwyLAogICAgICAgIGRhdGEuZnJhbWUoYWdlID0gMjIsIGFnZTIgPSAyMl4yLCBjaGlsZHJl
biA9IDEsCiAgICAgICAgICAgICAgICAgICBibWkgPSAyNywgc2V4ID0gIm1hbGUiLCBibWkzMCA9IDAsCiAgICAgICAgICAgICAgICAgICBzbW9rZXIgPSAieWVzIiwgcmVnaW9uID0gInNvdXRoZWFzdCIpKQoKIyBQcmludCByZXN1bHRzCmNhdCgiQ2FzZSAxIFByZWRpY3Rpb246IiwgY2FzZTEsICJcbiIpCmNhdCgiQ2FzZSAyIFByZWRpY3Rpb246IiwgY2FzZTIsICJcbiIpCmBgYAoK