library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readr)
library(curl)
## Using libcurl 7.79.1 with LibreSSL/3.3.5
##
## Attaching package: 'curl'
##
## The following object is masked from 'package:readr':
##
## parse_date
library(ggplot2)
library(dplyr)
library(scales)
##
## Attaching package: 'scales'
##
## The following object is masked from 'package:purrr':
##
## discard
##
## The following object is masked from 'package:readr':
##
## col_factor
library(zoo)
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(corrplot)
## corrplot 0.92 loaded
library(Hmisc)
##
## Attaching package: 'Hmisc'
##
## The following objects are masked from 'package:dplyr':
##
## src, summarize
##
## The following objects are masked from 'package:base':
##
## format.pval, units
library(Matrix)
##
## Attaching package: 'Matrix'
##
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
library(MASS)
##
## Attaching package: 'MASS'
##
## The following object is masked from 'package:dplyr':
##
## select
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
##
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
set.seed(1234)
X <- runif(10000, min = 5, max = 15)
Y <- rnorm(10000, mean = 10, sd = 2.89)
x <- median(X)
y <- median(Y)
#p(X>x | X>y)
p_a <- sum(X>x & X>y)/sum(X>y)
#p(X>x & Y>y)
p_b <- sum(X>x & Y>y)/length(X)
#p(X<x | X>y)
p_c <- sum(X<x & X>y)/sum(X>y)
#joint and marginal probabilities
tbl <- table(X > x, Y > y)
jp <- tbl / sum(tbl)
jp
##
## FALSE TRUE
## FALSE 0.2507 0.2493
## TRUE 0.2493 0.2507
m_X <- table(X > x) / length(X)
m_Y <- table(Y > y) / length(Y)
product_prob <- m_X * m_Y
product_prob
##
## FALSE TRUE
## 0.25 0.25
Check to see if independence holds by using Fisher’s Exact Test and the Chi Square Test. What is the difference between the two? Which is most appropriate? Are you surprised at the results? Why or why not?
fisher <- fisher.test(tbl)
chi_square <- chisq.test(tbl)
fisher
##
## Fisher's Exact Test for Count Data
##
## data: tbl
## p-value = 0.7949
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 0.9342763 1.0946016
## sample estimates:
## odds ratio
## 1.011264
chi_square
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: tbl
## X-squared = 0.0676, df = 1, p-value = 0.7949
test <- read.csv(curl("https://raw.githubusercontent.com/brsingh7/Data/Data605_Final/test.csv"))
train <- read.csv(curl("https://raw.githubusercontent.com/brsingh7/Data/Data605_Final/train.csv"))
summary(train)
## id Sex Length Diameter
## Min. : 0 Length:74051 Min. :0.1875 Min. :0.1375
## 1st Qu.:18512 Class :character 1st Qu.:1.1500 1st Qu.:0.8875
## Median :37025 Mode :character Median :1.3750 Median :1.0750
## Mean :37025 Mean :1.3175 Mean :1.0245
## 3rd Qu.:55538 3rd Qu.:1.5375 3rd Qu.:1.2000
## Max. :74050 Max. :2.0128 Max. :1.6125
## Height Weight Shucked.Weight Viscera.Weight
## Min. :0.0000 Min. : 0.0567 Min. : 0.02835 Min. : 0.04252
## 1st Qu.:0.3000 1st Qu.:13.4377 1st Qu.: 5.71242 1st Qu.: 2.86330
## Median :0.3625 Median :23.7994 Median : 9.90815 Median : 4.98951
## Mean :0.3481 Mean :23.3852 Mean :10.10427 Mean : 5.05839
## 3rd Qu.:0.4125 3rd Qu.:32.1625 3rd Qu.:14.03300 3rd Qu.: 6.98815
## Max. :2.8250 Max. :80.1015 Max. :42.18406 Max. :21.54562
## Shell.Weight Age
## Min. : 0.04252 Min. : 1.000
## 1st Qu.: 3.96893 1st Qu.: 8.000
## Median : 6.93145 Median :10.000
## Mean : 6.72387 Mean : 9.968
## 3rd Qu.: 9.07184 3rd Qu.:11.000
## Max. :28.49125 Max. :29.000
train %>%
keep(is.numeric) %>%
gather() %>%
ggplot(aes(value)) +
facet_wrap(~ key, scales = "free") +
geom_boxplot()
train %>%
keep(is.numeric) %>%
gather() %>%
ggplot(aes(value)) +
facet_wrap(~ key, scales = "free") +
geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
train %>%
gather(-id,-Age, key="var", value="value") %>%
ggplot(aes(x=value,y=Age))+
geom_point() +
geom_smooth(method="lm",color="red") +
facet_wrap(~ var, scales="free") +
theme_bw()
## `geom_smooth()` using formula = 'y ~ x'
#correlation matrix & hypothesis testing
corr_mat <- cor(train[, c("Diameter", "Height", "Age")])
corr_mat
## Diameter Height Age
## Diameter 1.0000000 0.9213531 0.6212559
## Height 0.9213531 1.0000000 0.6380669
## Age 0.6212559 0.6380669 1.0000000
corr_hyptest_a <- cor.test(train$Diameter, train$Height, conf.level = 0.80)
corr_hyptest_b <- cor.test(train$Diameter, train$Age, conf.level = 0.80)
corr_hyptest_c <- cor.test(train$Height, train$Age, conf.level = 0.80)
corr_hyptest_a
##
## Pearson's product-moment correlation
##
## data: train$Diameter and train$Height
## t = 644.97, df = 74049, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.9206384 0.9220617
## sample estimates:
## cor
## 0.9213531
corr_hyptest_b
##
## Pearson's product-moment correlation
##
## data: train$Diameter and train$Age
## t = 215.74, df = 74049, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.6183556 0.6241393
## sample estimates:
## cor
## 0.6212559
corr_hyptest_c
##
## Pearson's product-moment correlation
##
## data: train$Height and train$Age
## t = 225.5, df = 74049, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.6352664 0.6408507
## sample estimates:
## cor
## 0.6380669
#precision matrix
prec_mat <- solve(corr_mat)
#correlation * precision
a <- corr_mat %*% prec_mat
#precision * correlation
b <- prec_mat %*% corr_mat
#LDU Decomposition
ldu_decomp <- lu(b)
elu <- expand(ldu_decomp)
L<-elu$L
L
## 3 x 3 Matrix of class "dtrMatrix" (unitriangular)
## [,1] [,2] [,3]
## [1,] 1.000000e+00 . .
## [2,] 1.081651e-16 1.000000e+00 .
## [3,] -5.457358e-17 -1.111439e-16 1.000000e+00
U <- elu$U
U
## 3 x 3 Matrix of class "dtrMatrix"
## [,1] [,2] [,3]
## [1,] 1.000000e+00 -1.186970e-16 -2.775558e-16
## [2,] . 1.000000e+00 3.002183e-32
## [3,] . . 1.000000e+00
right_skewed <- train$Length
#it doesn't appear shifting is necessary, all values above 0
exp_distr <- fitdistr(right_skewed, "exponential")
lambda <- exp_distr$estimate["rate"]
sample2 <- rexp(1000, rate = lambda)
#compare histograms
par(mfrow = c(1, 2))
hist(train$Length, probability = TRUE)
hist(sample2, main = "Exponentilal", probability = TRUE)
# Calculate 5th and 95th percentiles using the CDF
percentile5_cdf <- qexp(0.05, rate = lambda)
percentile5_cdf
## [1] 0.06757686
percentile95_cdf <- qexp(0.95, rate = lambda)
percentile95_cdf
## [1] 3.946757
#95% confidence interval
ci <- qnorm(c(0.025, 0.975), mean = mean(right_skewed), sd = sd(right_skewed))
ci
## [1] 0.7534664 1.8814536
#empirical 5th and 95th percentiles
percentile5_emp <- quantile(train$Length, 0.05)
percentile95_emp <- quantile(train$Length, 0.95)
percentile5_emp
## 5%
## 0.7375
percentile95_emp
## 95%
## 1.675
#predict age based on length, diameter, height, and weight
#lm_mod <- lm(Age ~ Length + Diameter + Height + Weight, data = train)
#summary(lm_mod)
#residuals
#plot(residuals(lm_mod))
#par(mfrow = c(2, 2))
#plot(lm_mod)
#prediction <- predict(lm_mod,newdata = test)
#prediction
#rmse <- sqrt(mean(prediction-train$Age)^2)
#rmse
lm_mod2 <- lm_mod <- lm(Age ~ ., data = train)
lm3<- step(lm_mod2, direction = c("backward"))
## Start: AIC=111867.1
## Age ~ id + Sex + Length + Diameter + Height + Weight + Shucked.Weight +
## Viscera.Weight + Shell.Weight
##
## Df Sum of Sq RSS AIC
## - id 1 0 335336 111865
## <none> 335336 111867
## - Length 1 102 335438 111888
## - Diameter 1 363 335699 111945
## - Viscera.Weight 1 1504 336840 112197
## - Height 1 4232 339568 112794
## - Weight 1 5826 341163 113141
## - Sex 2 8658 343994 113751
## - Shell.Weight 1 12318 347654 114537
## - Shucked.Weight 1 38838 374174 119980
##
## Step: AIC=111865.1
## Age ~ Sex + Length + Diameter + Height + Weight + Shucked.Weight +
## Viscera.Weight + Shell.Weight
##
## Df Sum of Sq RSS AIC
## <none> 335336 111865
## - Length 1 102 335438 111886
## - Diameter 1 363 335700 111943
## - Viscera.Weight 1 1504 336840 112195
## - Height 1 4232 339568 112792
## - Weight 1 5826 341163 113139
## - Sex 2 8658 343994 113749
## - Shell.Weight 1 12318 347654 114535
## - Shucked.Weight 1 38838 374174 119978
summary(lm3)
##
## Call:
## lm(formula = Age ~ Sex + Length + Diameter + Height + Weight +
## Shucked.Weight + Viscera.Weight + Shell.Weight, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.0347 -1.2228 -0.3320 0.7537 22.0418
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.755758 0.075897 49.485 < 2e-16 ***
## SexI -1.040735 0.025209 -41.284 < 2e-16 ***
## SexM -0.115212 0.019157 -6.014 1.82e-09 ***
## Length 0.910211 0.192234 4.735 2.20e-06 ***
## Diameter 2.138507 0.238786 8.956 < 2e-16 ***
## Height 7.222272 0.236275 30.567 < 2e-16 ***
## Weight 0.194265 0.005416 35.867 < 2e-16 ***
## Shucked.Weight -0.614115 0.006632 -92.603 < 2e-16 ***
## Viscera.Weight -0.216351 0.011872 -18.224 < 2e-16 ***
## Shell.Weight 0.512127 0.009820 52.152 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.128 on 74041 degrees of freedom
## Multiple R-squared: 0.5508, Adjusted R-squared: 0.5508
## F-statistic: 1.009e+04 on 9 and 74041 DF, p-value: < 2.2e-16
#residuals
plot(residuals(lm3))
par(mfrow = c(2, 2))
plot(lm3)
prediction3 <- predict(lm3,newdata = test)
rmse <- sqrt(mean(prediction3-train$Age)^2)
## Warning in prediction3 - train$Age: longer object length is not a multiple of
## shorter object length
rmse
## [1] 0.0167269
kaggle_submission <- data.frame(Id = test$id, Age = prediction3)
write.csv(kaggle_submission, "BrianSingh_kaggle_crabsubmission.csv", row.names = FALSE)
Username: briansingh7, score: 1.48