### reading all libraries
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

### Prepare the data: we predict y (Life.expectancy) from x (Alcohol)
# Read the CSV, keep only the two columns used in the regression,
# drop rows with missing values, and give the columns short names.
data <- read.csv("Life Expectancy Data.csv") %>%
  subset(select = c(Life.expectancy, Alcohol)) %>%
  na.omit() %>%
  rename(y = Life.expectancy, x = Alcohol)

# Preview the first rows of the cleaned data
head(data)

##      y    x
## 1 65.0 0.01
## 2 59.9 0.01
## 3 59.9 0.01
## 4 59.5 0.01
## 5 59.2 0.01
## 6 58.8 0.01

Now fitting the model for y=Bo + B1x

# Fit the simple linear regression of y on x
model <- lm(formula = y ~ x, data = data)

Lets print our model

# Show the fitted call and the estimated coefficients
print(model)

## 
## Call:
## lm(formula = y ~ x, data = data)
## 
## Coefficients:
## (Intercept)            x  
##     64.7633       0.9546

From the fitted model we get B0 = 64.7633 and B1 = 0.9546, which means our equation becomes

Y = 64.7633 + 0.9546 * X

That means for every 1-unit increase in X, the predicted Y increases by 0.9546.

Model Adequacy check

# Scatter plot of the response y against the predictor x
plot(data$x,data$y, main = "Scatter plot ")
# Overlay the fitted regression line from `model`
abline(model)

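We can see the points do not follow the fitted line very tightly, so the linear relationship is only approximate. Note that a scatter plot cannot tell us whether the residuals are normally distributed (a normal Q-Q plot of the residuals is the right tool for that); for now we loosely assume normality holds.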

Lets check for constant variance

# Draw the standard lm diagnostic plots (including residuals vs fitted)
plot(model)

We can see in the residuals vs fitted plot that the residuals are not evenly spread around 0, hence we know the variance is not constant.

Hence lets do box cox first and convert it to constant variance

library(MASS)

## 
## Attaching package: 'MASS'

## The following object is masked from 'package:dplyr':
## 
##     select

# Profile the Box-Cox log-likelihood for `model` (also draws the profile plot)
trans <- boxcox(model)

# Grid of candidate lambdas and the log-likelihood at each of them
lambda <- trans[["x"]]
likelihood <- trans[["y"]]

# Pick the candidate lambda with the highest log-likelihood and print it
f <- lambda[which.max(likelihood)]
f

## [1] 2

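We can see that the best lambda is 2 (note that this is the upper edge of the default search range of boxcox(), -2 to 2, so the true optimum may be larger). That means the formula is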

y_transf = (y^lambda - 1) / lambda

# Apply the Box-Cox transformation with the single selected lambda `f`.
# `lambda` holds the whole grid of candidate values returned by boxcox();
# using it here would recycle the grid across the rows (a different exponent
# per row) and raise "longer object length is not a multiple of shorter
# object length" warnings.
data <- na.omit(data)
# The column name `y_trasnf` is kept as-is because later code refers to it.
data$y_trasnf <- if (isTRUE(all.equal(f, 0))) {
  log(data$y) # limiting form of the Box-Cox transform as lambda -> 0
} else {
  (data$y^f - 1) / f
}

## Warning in (data$y)^lambda: longer object length is not a multiple of shorter
## object length

## Warning in ((data$y)^lambda - 1)/lambda: longer object length is not a multiple
## of shorter object length

## Refit on the Box-Cox-transformed response and inspect the diagnostic
## plots, in particular residuals vs fitted
# Use column names plus `data =` rather than `data$...` inside the formula,
# so the coefficient is named `x` and predict() can be used with new data.
model2 <- lm(y_trasnf ~ x, data = data)

plot(model2)

We can still see that the residuals are clustered near 0 for lower fitted values but spread out widely for higher fitted values, hence Box-Cox didn't fully solve our issue.

lets try log transformation

# Log-transform the response and store it next to the original columns
data[["ylog"]] <- log(data[["y"]])

# Regress log(y) on x
model3 <- lm(data$ylog ~ data$x)

# Diagnostic plots for the log-response model
plot(model3)

Now we can see the variance is better stabilized above and below the zero line. Let's see whether our model is significant or not.

Hence lets rewrite all our model formula

# Auto-print the log-response model: call and estimated coefficients
model3

## 
## Call:
## lm(formula = data$ylog ~ data$x)
## 
## Coefficients:
## (Intercept)       data$x  
##     4.16235      0.01382

Hence our final regression model is

\[ \log(y) = 4.16 + 0.01382x \]

That means for every one-unit increase in x, log(y) increases by 0.0138 (the value is on the log scale because we log-transformed y); equivalently, y itself increases by about 1.4%.

Lets check now if our model is significant or not

 # Coefficient table, residual summary, R-squared and overall F-test
 summary(model3)

## 
## Call:
## lm(formula = data$ylog ~ data$x)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.65016 -0.05988  0.03145  0.09793  0.28213 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 4.1623508  0.0039650 1049.79   <2e-16 ***
## data$x      0.0138234  0.0006458   21.41   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1368 on 2733 degrees of freedom
## Multiple R-squared:  0.1436, Adjusted R-squared:  0.1433 
## F-statistic: 458.2 on 1 and 2733 DF,  p-value: < 2.2e-16

We can see the p-value is less than 0.05, hence we reject the null hypothesis (B1 = 0) and conclude that x and y are related and that this regression model is significant.

Simple Linear Regression

Sujit

2025-03-10

Now fitting the model for y=Bo + B1x

Lets print our model

From the fitted model we get B0 = 64.7633 and B1 = 0.9546, which means our equation becomes

Y = 64.7633 + 0.9546 * X

That means for every 1-unit increase in X, the predicted Y increases by 0.9546.

Model Adequacy check

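We can see the points do not follow the fitted line very tightly, so the linear relationship is only approximate. Note that a scatter plot cannot tell us whether the residuals are normally distributed (a normal Q-Q plot of the residuals is the right tool for that); for now we loosely assume normality holds.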

Lets check for constant variance

We can see in the residuals vs fitted plot that the residuals are not evenly spread around 0, hence we know the variance is not constant.

Hence lets do box cox first and convert it to constant variance

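We can see that the best lambda is 2 (note that this is the upper edge of the default search range of boxcox(), -2 to 2, so the true optimum may be larger). That means the formula is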

y_transf = (y^lambda - 1) / lambda

We can still see that the residuals are clustered near 0 for lower fitted values but spread out widely for higher fitted values, hence Box-Cox didn't fully solve our issue.

lets try log transformation

Now we can see the variance is better stabilized above and below the zero line. Let's see whether our model is significant or not.

Hence lets rewrite all our model formula

Hence our final regression model is

\[ \log(y) = 4.16 + 0.01382x \]

That means for every one-unit increase in x, log(y) increases by 0.0138 (the value is on the log scale because we log-transformed y); equivalently, y itself increases by about 1.4%.

Lets check now if our model is significant or not

We can see the p-value is less than 0.05, hence we reject the null hypothesis (B1 = 0) and conclude that x and y are related and that this regression model is significant.