### reading all libraries
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
### Load the life-expectancy data; we will predict y from a single variable x
# Read the CSV, keep only the two columns of interest, drop rows with missing
# values, and give the columns short names:
#   y = Life.expectancy (response), x = Alcohol (predictor)
data <- read.csv("Life Expectancy Data.csv") %>%
  dplyr::select(Life.expectancy, Alcohol) %>%
  na.omit() %>%
  rename(y = Life.expectancy, x = Alcohol)

# Preview the first rows of the prepared data frame
head(data)
##      y    x
## 1 65.0 0.01
## 2 59.9 0.01
## 3 59.9 0.01
## 4 59.5 0.01
## 5 59.2 0.01
## 6 58.8 0.01

Now fit the simple linear model y = B0 + B1 * x

# Ordinary least squares fit of y on x
model <- lm(formula = y ~ x, data = data)

Lets print our model

# Show the fitted call and the estimated intercept and slope
print(model)
## 
## Call:
## lm(formula = y ~ x, data = data)
## 
## Coefficients:
## (Intercept)            x  
##     64.7633       0.9546

We can see from the fitted model that B0 (the intercept) is 64.7633 and B1 (the slope) is 0.9546, which means our equation becomes

Y = 64.7633 + 0.9546 * X

That means for every 1-unit increase in X, the predicted Y increases by 0.9546

Model Adequacy check

# Scatter plot of the response against the predictor
plot(x = data$x, y = data$y, main = "Scatter plot ")
# Overlay the fitted regression line from `model`
abline(reg = model)

We can see the data are not quite normally distributed, but here we can loosely assume normality

Lets check for constant variance

# Draw the diagnostic plots for the fitted model (including residuals vs fitted)
plot(model)

We can see from the residuals vs fitted plot that the residuals are not evenly distributed around 0, hence the variance is non-constant

Hence let's apply a Box-Cox transformation first to try to stabilize the variance

library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
# Profile the Box-Cox log-likelihood for the fitted model
trans <- boxcox(object = model)

# trans$x is the grid of candidate lambdas, trans$y the log-likelihood at each
lambda <- trans[["x"]]
likelihood <- trans[["y"]]
# f is the single candidate lambda with the highest log-likelihood
# NOTE(review): boxcox()'s documented default grid is seq(-2, 2, 1/10); a
# maximum at the edge of the grid suggests re-running with a wider `lambda`
# range - confirm before relying on this value
f <- lambda[[which.max(likelihood)]]
print(f)
## [1] 2

We can see that the best lambda is 2, which means the transformation formula is

y_transf = (y^lambda - 1) / lambda

# Box-Cox transform of y using the single selected lambda `f`.
# `lambda` holds the whole grid of candidates returned by boxcox(), so using it
# as the exponent would recycle the grid across rows (R warns that the object
# lengths do not match) instead of applying one common power to every row.
if (abs(f) < 1e-8) {
  # Limiting case of the Box-Cox family as lambda -> 0
  data$y_trasnf <- log(data$y)
} else {
  data$y_trasnf <- (data$y^f - 1) / f
}

## lets now check for model plot and see residual vs fitted plot
# Refit on the transformed response; passing `data =` keeps the model tied to
# the data frame columns rather than to free-standing vectors
model2 <- lm(y_trasnf ~ x, data = data)
plot(model2)

We can still see that the residuals are clustered near 0 for lower fitted values but spread out widely for higher fitted values, hence Box-Cox didn't fully solve our issue

lets try log transformation

# Store the natural log of the response as a new column
data[["ylog"]] <- log(data[["y"]])
# Regress log(y) on x; the formula is written with data$... so the printed
# call and the coefficient label ("data$x") are unchanged
model3 <- lm(formula = data$ylog ~ data$x)
# Diagnostic plots for the log-response model
plot(x = model3)

Now we can see the variance is better stabilized above and below the zero line; let's see whether our model is significant or not

Hence lets rewrite all our model formula

# Auto-print model3 to show its call and estimated coefficients
model3
## 
## Call:
## lm(formula = data$ylog ~ data$x)
## 
## Coefficients:
## (Intercept)       data$x  
##     4.16235      0.01382

Hence our final regression model is

\[ \log(y) = 4.16 + 0.01382x \]

That means for every one-unit increase in x, log(y) increases by 0.0138 (the effect is on the log scale because we log-transformed y; this corresponds to roughly a 1.4% increase in y itself)

Lets check now if our model is significant or not

 # Full summary: coefficient table with t-tests, R-squared and the overall F-test
 summary(model3)
## 
## Call:
## lm(formula = data$ylog ~ data$x)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.65016 -0.05988  0.03145  0.09793  0.28213 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 4.1623508  0.0039650 1049.79   <2e-16 ***
## data$x      0.0138234  0.0006458   21.41   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1368 on 2733 degrees of freedom
## Multiple R-squared:  0.1436, Adjusted R-squared:  0.1433 
## F-statistic: 458.2 on 1 and 2733 DF,  p-value: < 2.2e-16

We can see the p-value is less than 0.05, hence we reject the null hypothesis and conclude that x and y are related and that this regression model is significant (although, with an R-squared of about 0.14, x explains only around 14% of the variance in log(y))