#Assignment 1. Regression model estimation

# Load libraries
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(tseries)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr   1.1.2     ✔ readr   2.1.4
## ✔ forcats 1.0.0     ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3     ✔ tibble  3.2.1
## ✔ purrr   1.0.2     ✔ tidyr   1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## The following object is masked from 'package:purrr':
## 
##     some
library(foreign)
library(timsac)
library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(mFilter)
library(nlme)
## 
## Attaching package: 'nlme'
## 
## The following object is masked from 'package:dplyr':
## 
##     collapse
library(lmtest)
library(broom)
library(kableExtra)
## 
## Attaching package: 'kableExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     group_rows
library(knitr)
library(MASS)
## 
## Attaching package: 'MASS'
## 
## The following object is masked from 'package:dplyr':
## 
##     select
library(parallel)
library(mlogit)
## Loading required package: dfidx
## 
## Attaching package: 'dfidx'
## 
## The following object is masked from 'package:MASS':
## 
##     select
## 
## The following object is masked from 'package:stats':
## 
##     filter
library(dplyr)
library(tidyr)
library(forecast)
## 
## Attaching package: 'forecast'
## 
## The following object is masked from 'package:nlme':
## 
##     getResponse
library(stats)
library(quantmod)
## Loading required package: xts
## 
## ######################### Warning from 'xts' package ##########################
## #                                                                             #
## # The dplyr lag() function breaks how base R's lag() function is supposed to  #
## # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or       #
## # source() into this session won't work correctly.                            #
## #                                                                             #
## # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
## # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop           #
## # dplyr from breaking base R's lag() function.                                #
## #                                                                             #
## # Code in packages is not affected. It's protected by R's namespace mechanism #
## # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning.  #
## #                                                                             #
## ###############################################################################
## 
## Attaching package: 'xts'
## 
## The following objects are masked from 'package:dplyr':
## 
##     first, last
## 
## Loading required package: TTR
library(foreach)
## 
## Attaching package: 'foreach'
## 
## The following objects are masked from 'package:purrr':
## 
##     accumulate, when
library(ISLR)
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
library(glmnet)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## 
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## 
## Loaded glmnet 4.1-7
library(pacman)

##Read a database in Rstudio or Posit Cloud

# Load the fish measurements from the CSV file in the working directory;
# column types are guessed by readr.
fd <- readr::read_csv(file = "Fish.csv")
## Rows: 159 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): Species
## dbl (6): Weight, Length1, Length2, Length3, Height, Width
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

## Look for a relationship between two variables that could make sense

# Scatter plot of Weight (y axis) against Length1 (x axis).
with(
  fd,
  plot(
    x = Length1,
    y = Weight,
    main = "Weight vs Length1",
    xlab = "Length1",
    ylab = "Weight"
  )
)

# Correlation (cor() default method) between Length1 and Weight, printed
# as a labelled string.
correlation <- with(fd, cor(x = Length1, y = Weight))
print(paste0("Correlation Coefficient: ", correlation))
## [1] "Correlation Coefficient: 0.915711716031204"

## Do a simple linear regression and explain the results

# Simple linear regression: Weight is the response, Length1 the only predictor.
lr <- lm(
  formula = Weight ~ Length1,
  data = fd
)
# Coefficient table, residual summary and fit statistics for the model.
summary(object = lr)
## 
## Call:
## lm(formula = Weight ~ Length1, data = fd)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -414.90  -71.26    6.08   98.31  363.84 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -462.375     32.243  -14.34   <2e-16 ***
## Length1       32.792      1.148   28.55   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 144.3 on 157 degrees of freedom
## Multiple R-squared:  0.8385, Adjusted R-squared:  0.8375 
## F-statistic: 815.3 on 1 and 157 DF,  p-value: < 2.2e-16
# Scatter plot of the observed data for the simple linear regression.
plot(fd$Length1, fd$Weight,
     xlab = "Length1", ylab = "Weight",
     main = " Simple Linear Regression")
# Overlay the line fitted in `lr`; without it this figure only repeats the
# raw scatter plot and shows nothing about the regression itself.
abline(lr, col = "red", lwd = 2)

##Do a multiple linear regression

# Multiple linear regression: Weight explained by all five size measurements.
# NOTE(review): the three length terms show large standard errors in the
# summary output, which may indicate collinearity -- consider checking with
# car::vif(mlr).
mlr <- lm(
  formula = Weight ~ Length1 + Length2 + Length3 + Height + Width,
  data = fd
)
summary(object = mlr)
## 
## Call:
## lm(formula = Weight ~ Length1 + Length2 + Length3 + Height + 
##     Width, data = fd)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -243.69  -65.10  -25.52   57.98  447.25 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -499.587     29.572 -16.894  < 2e-16 ***
## Length1       62.355     40.209   1.551  0.12302    
## Length2       -6.527     41.759  -0.156  0.87601    
## Length3      -29.026     17.353  -1.673  0.09643 .  
## Height        28.297      8.729   3.242  0.00146 ** 
## Width         22.473     20.372   1.103  0.27169    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 123.2 on 153 degrees of freedom
## Multiple R-squared:  0.8853, Adjusted R-squared:  0.8815 
## F-statistic: 236.2 on 5 and 153 DF,  p-value: < 2.2e-16

##Create a polynomial regression

# Polynomial regression of Weight on Length1.
# `degree` sets the order of the polynomial; poly() builds orthogonal
# polynomial terms by default, so the coefficients are not on the raw
# Length1 / Length1^2 scale.
degree <- 2
pr <- lm(
  formula = Weight ~ poly(Length1, degree),
  data = fd
)
summary(object = pr)
## 
## Call:
## lm(formula = Weight ~ poly(Length1, degree), data = fd)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -428.27  -62.07   -2.54   75.17  384.55 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              398.33      11.17  35.661  < 2e-16 ***
## poly(Length1, degree)1  4120.44     140.85  29.255  < 2e-16 ***
## poly(Length1, degree)2   417.99     140.85   2.968  0.00347 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 140.8 on 156 degrees of freedom
## Multiple R-squared:  0.8472, Adjusted R-squared:  0.8452 
## F-statistic: 432.3 on 2 and 156 DF,  p-value: < 2.2e-16
# Compare observed weights with the polynomial model's predictions on the
# same data the model was fitted to.
value <- fd[["Weight"]]
pred <- predict(object = pr, newdata = fd)
plot(
  x = value,
  y = pred,
  main = "Weight vs. Predicted Weight Polynomial Regression",
  xlab = "Weight",
  ylab = "Weight Prediction"
)

# Insights
### We used the two variables Length1 and Weight because we wanted to see whether the length of the animal, in this case a fish, is related to its weight and can be used to predict it.
### The correlation coefficient (about 0.92) is close to 1, which means there is a strong positive linear relationship between the two variables.
### The difference between simple and multiple regression is the number of independent variables used to predict the dependent variable.
### In simple regression there is only one independent variable that is used to predict the dependent variable. The relationship between the two variables is modeled as a straight line.
### In multiple regression there are two or more independent variables that are used to predict the dependent variable. The relationship is modeled as a linear combination of the independent variables.