Explore and prepare the data

```{r}
# stringsAsFactors = TRUE makes read.csv() turn the character columns of the
# file into factors as it is read.
insurance <- read.csv("insurance.csv", stringsAsFactors = TRUE)
str(insurance)
```

```{r}
# Summarize the expenses variable: minimum, 1st quartile, median, mean,
# 3rd quartile, and maximum. The `$` between `insurance` and `expenses`
# selects the `expenses` column of the data frame.
summary(insurance$expenses)
```

```{r}
# Histogram of insurance expenses.
# Visualizes how frequently each range of values occurs, i.e. how the
# expenses are spread out across the insurance dataset. `expenses` is the
# response variable of the regression, which is why it is examined here.
hist(insurance$expenses)
```


```{r}
# Table of region.
# Build a frequency table of the regions in the dataset; `$` selects the
# `region` column. table() summarizes the count of occurrences of each
# category in categorical data.
table(insurance$region)
```

```{r}
# Exploring relationships among features: correlation matrix of the
# numeric columns.
cor(insurance[c("age", "bmi", "children", "expenses")])
```

```{r}
# Visualizing relationships among features: scatterplot matrix.
# - Points that rise from left to right (upward slope) indicate a positive
#   correlation: as one variable increases, the other tends to increase too.
# - Points that fall from left to right (downward slope) indicate a negative
#   correlation: as one variable increases, the other tends to decrease.
# - Points scattered randomly, with no clear trend, suggest that there is no
#   correlation between the two variables.

# Column names must be written with straight ASCII quotes; typographic
# ("curly") quotes are a syntax error in R.
pairs(insurance[c("age", "bmi", "children", "expenses")])
```

```{r}
# Fit a multiple linear regression of expenses on all original features.
# The predictors are listed explicitly instead of using `expenses ~ .`:
# the two forms are equivalent only while `insurance` holds just the original
# columns. Later chunks add `age2`, `bmi30`, and `pred` to the data frame,
# and `.` would silently use them as predictors if this chunk were re-run.
ins_model <- lm(expenses ~ age + children + bmi + sex + smoker + region,
                data = insurance)

# See the estimated beta coefficients
ins_model
```

```{r}
# Step 4: Evaluating model performance.
# See more detail about the estimated beta coefficients.
# A higher absolute t-value indicates stronger evidence against the null
# hypothesis; it plays a vital role in determining statistical significance
# in hypothesis testing.
summary(ins_model)
```

```{r}
## Step 5: Improving model performance
# Add a higher-order "age" term.
# The relationship between age and cost is not linear; fitting a quadratic
# term (age^2) allows the model to fit the data better and potentially
# reduce the residual error.
insurance$age2 <- insurance$age^2
```

```{r}
# Add an indicator for BMI >= 30, stored in a new column called `bmi30`.
# ifelse(condition, value_if_true, value_if_false):
#   condition:      insurance$bmi >= 30 checks whether the BMI is greater
#                   than or equal to 30.
#   value_if_true:  1 is assigned when the condition is TRUE
#                   (the person has a BMI of 30 or more).
#   value_if_false: 0 is assigned when the condition is FALSE
#                   (the person has a BMI less than 30).
insurance$bmi30 <- ifelse(insurance$bmi >= 30, 1, 0)
```

```{r}
# Create the final model: adds the quadratic age term and the BMI >= 30
# indicator. In a formula, `bmi30 * smoker` expands to both main effects
# plus their interaction (bmi30 + smoker + bmi30:smoker).
ins_model2 <- lm(expenses ~ age + age2 + children + bmi + sex +
                   bmi30 * smoker + region, data = insurance)
```

```{r}
# Detailed coefficient table and fit statistics for the final model.
summary(ins_model2)
```

```{r}
# Making predictions with the regression model: store the fitted value for
# every row, then measure how strongly predictions track actual expenses.
insurance$pred <- predict(ins_model2, insurance)
cor(insurance$pred, insurance$expenses)
```

```{r}
# Predicted vs. actual expenses. The dashed blue reference line has
# intercept 0 and slope 1, i.e. the points where prediction equals actual.
plot(insurance$pred, insurance$expenses)
abline(a = 0, b = 1, col = "blue", lwd = 3, lty = 2)
```

Use the MLR solution built during class to predict the insurance quote given the following case scenarios:

Case 1: Age=22, Children=3, bmi=24, sex=female, bmi30=0, smoker=no, region=Northwest.

```{r}
# Case 1: Age=22, Children=3, bmi=24, sex=female, bmi30=0, smoker=no,
# region=Northwest.
case1 <- data.frame(age = 22, children = 3, bmi = 24, sex = "female",
                    bmi30 = 0, smoker = "no", region = "northwest")
# Derive the quadratic term from `age` so the two can never disagree
# (a hard-coded 30^2 does not match age = 22).
case1$age2 <- case1$age^2
predict(ins_model2, case1)
```

```{r}
# Case 2: Age=22, Children=1, bmi=27, sex=male, bmi30=0, smoker=yes,
# region=Southeast.
case2 <- data.frame(age = 22, children = 1, bmi = 27, sex = "male",
                    bmi30 = 0, smoker = "yes", region = "southeast")
# Same derivation as in Case 1: age2 must equal age^2.
case2$age2 <- case2$age^2
predict(ins_model2, case2)
```

Based on the predictions in Case 1 versus Case 2, we can infer that the cost is likely to be higher for a male living in the Southeast who is a smoker and has a higher BMI, compared to a female living in the Northwest who is a non-smoker with a lower BMI, even though she has more children.

LS0tCnRpdGxlOiAiSW5zdXJhbmNlIE1MUiIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKRXhwcGxvcmUgYW5kIHByZXAgdGhlIGRhdGEgCmBgYHtyfQojc3RyaW5nQXNGYWN0b3JzID0gVHJ1ZSB3aWxsIGNoYW5nZSBzdHJpbmdzIGFzIGZhY3RvcnMgCmluc3VyYW5jZSA8LSByZWFkLmNzdigiaW5zdXJhbmNlLmNzdiIsIHN0cmluZ3NBc0ZhY3RvcnMgPSBUUlVFKQpzdHIoaW5zdXJhbmNlKQpgYGAKYGBge3J9CiMgc3VtbWFyaXplIHRoZSBjaGFyZ2VzIHZhcmlhYmxlLiBIZXJlIHdlIGNhbiBzZWUgdGhlIG1pbiwgbWVkaWFuLCBtZWFuLCBtYXggYW5kIDFzdCBxdWFydGlsZSBhbmQgM3JkIHF1YXJ0aWxlIC4gSGVyZSB3ZSBhcmUgc3VtbWFyemluZyB0aGUgZXhwbmVzZXMgdGhlICQgYmV0d2VlbiBpbnN1cmFuY2UgYW5kIGV4cGVuc2UgYWxsb3dzIHVzIHRvIGNob29zZSB0aGUgY29sdW1uIGV4cGVuc2VzCnN1bW1hcnkoaW5zdXJhbmNlJGV4cGVuc2VzKQpgYGAKCmBgYHtyfQojIGhpc3RvZ3JhbSBvZiBpbnN1cmFuY2UgY2hhcmdlcwojSGVyZSB3ZSBjYW4gdmlzdWFsaXplIHRoZSBmcmVxdW5lY3kgb2YgZGlmZmVyZW50IHZhbHVlcyBvciByYW5nZXMgb2YgdmFsdWVzIHdpdGhpbiB0aGUgZGF0YXNldC4gV2UgY2FuIHNlZQojaG93IHNwcmVhZCB0aGUgZGF0YXQgaXMgc3ByZWFkIG91dCBhY3Jvc3MgdGhlIGRpZmZlcmVudCB2YWx1ZXMgKGV4cGVuc2VzKSB3aXRoaW4gdGhlIGluc3VyYW5jZSBkYXRhc2V0LgojSW4gdGhpcyBkYXRzZXQsIGV4cGVuc2VzIGFyZSB0aGUgcmVzcG9uc2l2ZSB2YXJpYWJsZS4gVGhpcyBpcyB3aHkgd2UgY2hvb3NlIHRvIHNlbGVjdCB0aGUgZXhwZW5zZXMgdmFyaWFibGUuCmhpc3QoaW5zdXJhbmNlJGV4cGVuc2VzKQoKYGBgCgpgYGB7cn0KIyB0YWJsZSBvZiByZWdpb24KI0hlcmUgd2UgY3JlYXRlIGEgdGFibGUgd2l0aCB0aGUgcmVnaW9ucyBpbiB0aGUgZGF0YXNldC4gQXMgbWVudGlvbmVkIGVhcmxpZXIsIAojIHRoZSAkIGFsbG93cyB1cyB0byBjaG9vc2UgdGhlIGNvbHVtbiBmcm9tIHRoZSBkYXRhc2V0LiBXZSBjYW4gdXNlIHRhYmxlcyBpbiBSIHRvIGNyZWF0ZQojZnJlcXVlbmN5IHRhYmxlcyB0byBzdW1tYXJpemUgdGhlIGNvdW50IG9mIG9jY3VycmVuY2VzIG9mIGVhY2ggY2F0ZWdvcnkgaW4gY2F0ZWdvcmlhbCBkYXRhCnRhYmxlKGluc3VyYW5jZSRyZWdpb24pCmBgYAoKYGBge3J9CiMgZXhwbG9yaW5nIHJlbGF0aW9uc2hpcHMgYW1vbmcgZmVhdHVyZXM6IGNvcnJlbGF0aW9uIG1hdHJpeApjb3IoaW5zdXJhbmNlW2MoImFnZSIsICJibWkiLCAiY2hpbGRyZW4iLCAiZXhwZW5zZXMiKV0pCmBgYAoKYGBge3J9CiMgdmlzdWFsaW5nIHJlbGF0aW9uc2hpcHMgYW1vbmcgZmVhdHVyZXM6IHNjYXR0ZXJwbG90IG1hdHJpeAojIElmIHRoZSBwb2ludHMgdGVuZCB0byByaXNlIGZyb20gbGVmdCB0byByaWdodCAoYW4gdXB3YXJkIHNsb3BlKSwgdGhpcyBpbmRpY2F0ZXMgYSBwb3NpdGl2ZSBjb3JyZWxh
dGlvbi4gQXMgb25lIHZhcmlhYmxlIGluY3JlYXNlcywgdGhlIG90aGVyIHZhcmlhYmxlIHRlbmRzIHRvIGluY3JlYXNlIGFzIHdlbGwuCiNJZiB0aGUgcG9pbnRzIHRlbmQgdG8gZmFsbCBmcm9tIGxlZnQgdG8gcmlnaHQgKGEgZG93bndhcmQgc2xvcGUpLCB0aGlzIGluZGljYXRlcyBhIG5lZ2F0aXZlIGNvcnJlbGF0aW9uLiBBcyBvbmUgdmFyaWFibGUgaW5jcmVhc2VzLCB0aGUgb3RoZXIgdGVuZHMgdG8gZGVjcmVhc2UuCiNJZiB0aGUgcG9pbnRzIGFyZSBzY2F0dGVyZWQgcmFuZG9tbHksIHdpdGggbm8gY2xlYXIgdHJlbmQsIHRoaXMgc3VnZ2VzdHMgdGhhdCB0aGVyZSBpcyBubyBjb3JyZWxhdGlvbiBiZXR3ZWVuIHRoZSB0d28gdmFyaWFibGVzLgoKcGFpcnMoaW5zdXJhbmNlW2MoImFnZSIsICJibWkiLCAiY2hpbGRyZW4iLCAiZXhwZW5zZXMiKV0pCmBgYApgYGB7cn0KaW5zX21vZGVsIDwtIGxtKGV4cGVuc2VzIH4gYWdlICsgY2hpbGRyZW4gKyBibWkgKyBzZXggKyBzbW9rZXIgKyByZWdpb24sCiAgICAgICAgICAgICAgICBkYXRhID0gaW5zdXJhbmNlKQppbnNfbW9kZWwgPC0gbG0oZXhwZW5zZXMgfiAuLCBkYXRhID0gaW5zdXJhbmNlKSAjIHRoaXMgaXMgZXF1aXZhbGVudCB0byBhYm92ZQoKIyBzZWUgdGhlIGVzdGltYXRlZCBiZXRhIGNvZWZmaWNpZW50cwppbnNfbW9kZWwKYGBgCgpgYGB7cn0KI1N0ZXAgNCBtb2RlbCBvZiBwZXJmb3JtYW5jZSAKIyBzZWUgbW9yZSBkZXRhaWwgYWJvdXQgdGhlIGVzdGltYXRlZCBiZXRhIGNvZWZmaWNpZW50cwojQSBoaWdoZXIgYWJzb2x1dGUgdC12YWx1ZSBpbmRpY2F0ZXMgc3Ryb25nZXIgZXZpZGVuY2UgYWdhaW5zdCB0aGUgbnVsbCBoeXBvdGhlc2lzLiBJdCBwbGF5cyBhIHZpdGFsCiNyb2xlIGluIGRldGVybWluaW5nIHN0YXRpc3RpY2FsIHNpZ25pZmljYW5jZSBpbiBoeXBvdGhlc2lzIHRlc3RpbmcuCnN1bW1hcnkoaW5zX21vZGVsKQpgYGAKYGBge3J9CiMjIFN0ZXAgNTogSW1wcm92aW5nIG1vZGVsIHBlcmZvcm1hbmNlCiMgYWRkIGEgaGlnaGVyLW9yZGVyICJhZ2UiIHRlcm0gCiN0aGUgcmVsYXRpb25zaGlwIGJldHdlZW4gYWdlIGFuZCBjb3N0IGlzIG5vdCBsaW5lYXIsIGZpdHRpbmcgYSBxdWFkcmF0aWMgdGVybSBhZ2VeMiBhbGxvd3MgdGhlIG1vZGVsIHRvIGZpdCB0aGUgZGF0YSBiZXR0ZXIgYW5kIHBvdGVudGlhbGx5IHJlZHVjZSB0aGUgcmVzaWR1YWwgZXJyb3IuCmluc3VyYW5jZSRhZ2UyIDwtIGluc3VyYW5jZSRhZ2VeMgpgYGAKCgpgYGB7cn0KIyBhZGQgYW4gaW5kaWNhdG9yIGZvciBCTUkgPj0gMzAKI3dlIGFyZSBhZGRpbmcgYSBjb2x1bW4gY2FsbGVkIGJtaTMwCiNjb25kaXRpb246IGluc3VyYW5jZSRibWkgPj0gMzAgY2hlY2tzIGlmIHRoZSBCTUkgaXMgZ3JlYXRlciB0aGFuIG9yIGVxdWFsIHRvIDMwLgojdmFsdWVfaWZfdHJ1ZTogMSBpcyBhc3NpZ25lZCBpZiB0aGUgY29uZGl0aW9uIChibWkgPj0gMzApIGlzIFRSVUUgKGkuZS4sIHRoZSBwZXJzb24gaGFz
IGEgQk1JIG9mIDMwIG9yIG1vcmUpLgojdmFsdWVfaWZfZmFsc2U6IDAgaXMgYXNzaWduZWQgaWYgdGhlIGNvbmRpdGlvbiBpcyBGQUxTRSAoaS5lLiwgdGhlIHBlcnNvbiBoYXMgYSBCTUkgbGVzcyB0aGFuIDMwKS4KaW5zdXJhbmNlJGJtaTMwIDwtIGlmZWxzZShpbnN1cmFuY2UkYm1pID49IDMwLCAxLCAwKQpgYGAKCmBgYHtyfQojIGNyZWF0ZSBmaW5hbCBtb2RlbAppbnNfbW9kZWwyIDwtIGxtKGV4cGVuc2VzIH4gYWdlICsgYWdlMiArIGNoaWxkcmVuICsgYm1pICsgc2V4ICsKICAgICAgICAgICAgICAgICAgIGJtaTMwKnNtb2tlciArIHJlZ2lvbiwgZGF0YSA9IGluc3VyYW5jZSkKYGBgCgpgYGB7cn0Kc3VtbWFyeShpbnNfbW9kZWwyKQpgYGAKYGBge3J9CiMgbWFraW5nIHByZWRpY3Rpb25zIHdpdGggdGhlIHJlZ3Jlc3Npb24gbW9kZWwKaW5zdXJhbmNlJHByZWQgPC0gcHJlZGljdChpbnNfbW9kZWwyLCBpbnN1cmFuY2UpCmNvcihpbnN1cmFuY2UkcHJlZCwgaW5zdXJhbmNlJGV4cGVuc2VzKQpgYGAKCmBgYHtyfQpwbG90KGluc3VyYW5jZSRwcmVkLCBpbnN1cmFuY2UkZXhwZW5zZXMpCmFibGluZShhID0gMCwgYiA9IDEsIGNvbCA9ICJibHVlIiwgbHdkID0gMywgbHR5ID0gMikKYGBgCgpVc2UgdGhlIE1MUiBzb2x1dGlvbiBidWlsdCBkdXJpbmcgY2xhc3MgdG8gcHJlZGljdCB0aGUgaW5zdXJhbmNlIHF1b3RlIGdpdmVuIHRoZSBmb2xsb3dpbmcgY2FzZSBzY2VuYXJpb3M6CgpDYXNlIDE6ICBBZ2U9MjIsIENoaWxkcmVuPTEsYm1pPTI3LHNleD1tYWxlLGJtaTMwPTAsc21va2VyPXllcywgcmVnaW9uPVNvdXRoZWFzdC4KYGBge3J9CiNDYXNlIDE6ICBBZ2U9MjIsIENoaWxkcmVuPTMsYm1pPTI0LHNleD1mZW1hbGUsYm1pMzA9MCxzbW9rZXI9bm8sIHJlZ2lvbj1Ob3J0aHdlc3QuCnByZWRpY3QoaW5zX21vZGVsMiwKICAgICAgICBkYXRhLmZyYW1lKGFnZSA9IDIyLCBhZ2UyID0gMzBeMiwgY2hpbGRyZW4gPSAzLAogICAgICAgICAgICAgICAgICAgYm1pID0gMjQsIHNleCA9ICJmZW1hbGUiLCBibWkzMCA9IDAsCiAgICAgICAgICAgICAgICAgICBzbW9rZXIgPSAibm8iLCByZWdpb24gPSAibm9ydGhlYXN0IikpCmBgYAoKYGBge3J9CiNDYXNlIDI6ICBBZ2U9MjIsIENoaWxkcmVuPTEsYm1pPTI3LHNleD1tYWxlLGJtaTMwPTAsc21va2VyPXllcywgcmVnaW9uPVNvdXRoZWFzdC4KcHJlZGljdChpbnNfbW9kZWwyLAogICAgICAgIGRhdGEuZnJhbWUoYWdlID0gMjIsIGFnZTIgPSAzMF4yLCBjaGlsZHJlbiA9IDEsCiAgICAgICAgICAgICAgICAgICBibWkgPSAyNywgc2V4ID0gIm1hbGUiLCBibWkzMCA9IDAsCiAgICAgICAgICAgICAgICAgICBzbW9rZXIgPSAieWVzIiwgcmVnaW9uID0gInNvdXRoZWFzdCIpKQpgYGAKQmFzZWQgb24gdGhlIHByZWRpY3Rpb25zIGluIENhc2UgMSB2ZXJzdXMgQ2FzZSAyLCB3ZSBjYW4gaW5mZXIgdGhhdCB0aGUgY29zdCBpcyBsaWtlbHkgdG8gYmUgaGlnaGVyIGZvciBhIG1hbGUgbGl2aW5nIGlu
IHRoZSBTb3V0aCB3aG8gaXMgYSBzbW9rZXIgYW5kIGhhcyBhIGhpZ2hlciBCTUksIGNvbXBhcmVkIHRvIGEgZmVtYWxlIGxpdmluZyBpbiB0aGUgTm9ydGhlYXN0IHdobyBpcyBhIG5vbi1zbW9rZXIgd2l0aCBhIGxvd2VyIEJNSSwgZXZlbiB0aG91Z2ggc2hlIGhhcyBtb3JlIGNoaWxkcmVuLg==