install.packages("psych", repos = "https://cloud.r-project.org")
## 
## The downloaded binary packages are in
##  /var/folders/5_/389qrkvs1sd7nkp792bslx5r0000gn/T//RtmppDJL37/downloaded_packages
library(psych)
install.packages("readr", repos = "https://cloud.r-project.org")
## 
## The downloaded binary packages are in
##  /var/folders/5_/389qrkvs1sd7nkp792bslx5r0000gn/T//RtmppDJL37/downloaded_packages
library(readr)
install.packages("dplyr", repos = "https://cloud.r-project.org")
## 
## The downloaded binary packages are in
##  /var/folders/5_/389qrkvs1sd7nkp792bslx5r0000gn/T//RtmppDJL37/downloaded_packages
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Picking 2 dataframes

data("Loblolly")   # load the Loblolly tree growth data; data() loads the object into the workspace
data("chickwts")   # load the chick weights data
# Adjust the Loblolly sample (84 rows) to match the chickwts sample size (71 rows)
realdf2 <- Loblolly[-(72:84), ]
realdf2
## Grouped Data: height ~ age | Seed
##    height age Seed
## 1    4.51   3  301
## 15  10.89   5  301
## 29  28.72  10  301
## 43  41.74  15  301
## 57  52.70  20  301
## 71  60.92  25  301
## 2    4.55   3  303
## 16  10.92   5  303
## 30  29.07  10  303
## 44  42.83  15  303
## 58  53.88  20  303
## 72  63.39  25  303
## 3    4.79   3  305
## 17  11.37   5  305
## 31  30.21  10  305
## 45  44.40  15  305
## 59  55.82  20  305
## 73  64.10  25  305
## 4    3.91   3  307
## 18   9.48   5  307
## 32  25.66  10  307
## 46  39.07  15  307
## 60  50.78  20  307
## 74  59.07  25  307
## 5    4.81   3  309
## 19  11.20   5  309
## 33  28.66  10  309
## 47  41.66  15  309
## 61  53.31  20  309
## 75  63.05  25  309
## 6    3.88   3  311
## 20   9.40   5  311
## 34  25.99  10  311
## 48  39.55  15  311
## 62  51.46  20  311
## 76  59.64  25  311
## 7    4.32   3  315
## 21  10.43   5  315
## 35  27.16  10  315
## 49  40.85  15  315
## 63  51.33  20  315
## 77  60.07  25  315
## 8    4.57   3  319
## 22  10.57   5  319
## 36  27.90  10  319
## 50  41.13  15  319
## 64  52.43  20  319
## 78  60.69  25  319
## 9    3.77   3  321
## 23   9.03   5  321
## 37  25.45  10  321
## 51  38.98  15  321
## 65  49.76  20  321
## 79  60.28  25  321
## 10   4.33   3  323
## 24  10.79   5  323
## 38  28.97  10  323
## 52  42.44  15  323
## 66  53.17  20  323
## 80  61.62  25  323
## 11   4.38   3  325
## 25  10.48   5  325
## 39  27.93  10  325
## 53  40.20  15  325
## 67  50.06  20  325
## 81  58.49  25  325
## 12   4.12   3  327
## 26   9.92   5  327
## 40  26.54  10  327
## 54  37.82  15  327
## 68  48.43  20  327
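Before combining the two samples, it helps to confirm that the trimmed Loblolly data and chickwts now have the same number of rows. The check below is a minimal sketch added for illustration, not part of the original output:

# Sanity check: the trimmed Loblolly sample should match chickwts in length
nrow(Loblolly)   # 84 observations in the full dataset
nrow(realdf2)    # 71 observations after dropping rows 72-84
nrow(chickwts)   # 71 observations
stopifnot(nrow(realdf2) == nrow(chickwts))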
# Create a dataframe for your 2 variables 

df3 <- data.frame(realdf2$height, chickwts$weight)
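As a design note, naming the columns when building the data frame gives cleaner variable names than the auto-generated realdf2.height and chickwts.weight. The alternative below is only a sketch and is not used for the output that follows:

# Alternative construction with descriptive column names (illustrative only)
df3_named <- data.frame(height = realdf2$height, weight = chickwts$weight)
head(df3_named)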
?plot
## Help on topic 'plot' was found in the following packages:
## 
##   Package               Library
##   graphics              /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/library
##   base                  /Library/Frameworks/R.framework/Resources/library
## 
## 
## Using the first match ...
plot(realdf2$height ~ chickwts$weight, 
     main = "Tree Heights vs Chicken Weight",
     xlab = "Chicken Weight",
     ylab = "Tree Height"
     )

# Calculating cor 

cor( realdf2$height, chickwts$weight )
## [1] -0.05429056

We have a correlation of -0.05, which is extremely weak. The direction is also negative, meaning that an increase in one variable is accompanied by a slight decrease in the other.
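To check whether this near-zero correlation is distinguishable from zero, base R's cor.test() can be applied to the same two vectors; this is an added sketch rather than part of the original analysis:

# Test H0: the true correlation between tree height and chick weight is zero
cor.test(realdf2$height, chickwts$weight)
# For a simple linear regression, this p-value matches the slope test below (about 0.65),
# so there is no evidence of a non-zero correlation.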

cor(df3)
##                 realdf2.height chickwts.weight
## realdf2.height      1.00000000     -0.05429056
## chickwts.weight    -0.05429056      1.00000000

Creating the linear regression

#Creating the model 

model <- lm(df3$realdf2.height ~ df3$chickwts.weight, data = df3)
summary(model)
## 
## Call:
## lm(formula = df3$realdf2.height ~ df3$chickwts.weight, data = df3)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -29.970 -22.749  -2.116  19.460  30.883 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         36.07823    8.70974   4.142 9.59e-05 ***
## df3$chickwts.weight -0.01443    0.03195  -0.452    0.653    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 20.87 on 69 degrees of freedom
## Multiple R-squared:  0.002947,   Adjusted R-squared:  -0.0115 
## F-statistic: 0.204 on 1 and 69 DF,  p-value: 0.6529
plot(model)

model2 <- lm(df3$chickwts.weight ~ df3$realdf2.height, data = df3)
summary(model2)
## 
## Call:
## lm(formula = df3$chickwts.weight ~ df3$realdf2.height, data = df3)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -158.979  -55.916   -4.024   61.933  155.974 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        267.9080    17.3285  15.461   <2e-16 ***
## df3$realdf2.height  -0.2042     0.4522  -0.452    0.653    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 78.52 on 69 degrees of freedom
## Multiple R-squared:  0.002947,   Adjusted R-squared:  -0.0115 
## F-statistic: 0.204 on 1 and 69 DF,  p-value: 0.6529
plot(model2)
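plot() on an lm object draws the four diagnostic plots one at a time; an optional layout tweak, shown here as a sketch, displays them together in a single 2x2 grid for model2:

# Optional: show all four diagnostic plots for model2 together
op <- par(mfrow = c(2, 2))
plot(model2)
par(op)  # restore the previous plotting layout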

## Slope estimate interpretation

For model 2: intercept = 267.91, slope = -0.204, p-value = 0.653, R-squared = 0.0029. The slope estimate means that each additional foot of tree height is associated with a predicted decrease of about 0.20 grams in chick weight, but with p = 0.653 this estimate is not statistically distinguishable from zero.
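To make the slope concrete, the fitted equation for model 2 (predicted weight = 267.91 - 0.204 * height) can be evaluated from the stored coefficients; the tree height of 30 used below is just an illustrative input:

# Predicted chick weight (in grams) at an illustrative tree height of 30 feet
b <- coef(model2)           # intercept and slope from model 2
unname(b[1] + b[2] * 30)    # 267.91 - 0.204 * 30, roughly 261.8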

Interpret the residuals

Residuals vs Fitted: In the Residuals vs Fitted plot for model 2 we can observe that the residuals show a roughly linear relationship.

Normal Q-Q: In the Normal Q-Q plot for model 2 we can see that the residuals follow the reference line fairly closely, although there are a few extreme observations (a numeric check is sketched after this list).

Scale-Location: In the Scale-Location plot for model 2 we can see that the residuals are not spread equally along the range of the predictor, indicating that the assumption of homoscedasticity is violated.

Residuals vs Leverage: In the Residuals vs Leverage plot for model 2 we can see that there is no outlier that unduly influences the regression analysis; none of the cases fall outside Cook's distance.
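As a numeric complement to the visual Q-Q assessment above, base R's shapiro.test() can be run on the residuals of model 2; this check is an added sketch, not part of the original analysis:

# Formal normality check of the model 2 residuals
shapiro.test(residuals(model2))
# A large p-value would be consistent with normally distributed residuals.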

Did the Gauss-Markov assumptions hold?

1) Model is linear in parameters: As we can see in the first plot, the data do not fall along a straight line.

2) Non-collinearity: our two variables are independent, so we can distinguish the effects of the two variables.

3) Homoscedasticity: Looking at the Scale-Location plot for model 2, the residuals do not show constant variance; the spread of the points is not uniform across the fitted values, so this assumption does not hold.

4) Normality of residuals: As we can see in the Normal Q-Q plot for model 2, the residuals follow a normal distribution reasonably closely.

Since only 2 of the 4 Gauss-Markov assumptions hold, the regression estimates should be interpreted with caution. Taken together with the near-zero correlation (-0.05) and the non-significant slope (p = 0.653), we can safely say that there is no meaningful relationship between the height of trees and the weight of chickens.