# Import dataset ----
# Read the training CSV into the `training` data frame used by the rest of
# the script.
training <- read.csv(
  file = "/Users/samuelsinger/Desktop/Homework #1 Econometrics/training.csv"
)
#Loading Packages
library(psych)
library(sjPlot)
library(sjmisc)
library(sjlabelled)
library(ggplot2)
library(Hmisc)
library(dplyr)
library(pacman)
library(tidyverse)
library(summarytools)
library(visdat)
library(stargazer)
# Data analysis ----
# Missing-observations plot 1: missingness map of the raw data, with columns
# sorted by amount of missingness and percentages displayed.
vis_miss(
  x = training,
  sort_miss = TRUE,
  show_perc = TRUE
)

# Summary stats ----
# Columns 2 through 17 of the raw data are the ones summarised here.
training.select <- training[2:17]

# Table of descriptive statistics produced by summarytools::descr().
SummaryStats <- summarytools::descr(training.select)
View(SummaryStats)

# Per-variable data-frame summary. This must be given the data itself:
# passing `SummaryStats` (as before) summarised the table of statistics,
# yielding a 15 x 16 "data frame" whose reported means and quantiles mix
# unrelated statistics together. The pasted console output that follows this
# call was produced by that earlier version and should be regenerated.
dfSummary(
  training.select,
  plain.ascii  = FALSE,
  style        = "grid",
  graph.magnif = 0.85,
  varnumbers   = FALSE,
  valid.col    = FALSE,
  tmp.img.dir  = "/tmp"
)
## SummaryStats was converted to a data frame
## ### Data Frame Summary  
## #### SummaryStats  
## **Dimensions:** 15 x 16  
## **Duplicates:** 0  
## 
## +-------------------+------------------------------+--------------------+----------------------+---------+
## | Variable          | Stats / Values               | Freqs (% of Valid) | Graph                | Missing |
## +===================+==============================+====================+======================+=========+
## | TARGET_WINS\      | Mean (sd) : 193.3 (578.1)\   | 15 distinct values | ![](/tmp/ds0001.png) | 0\      |
## | [numeric]         | min < med < max:\            |                    |                      | (0.0%)  |
## |                   | -0.4 < 21 < 2276\            |                    |                      |         |
## |                   | IQR (CV) : 86.4 (3)          |                    |                      |         |
## +-------------------+------------------------------+--------------------+----------------------+---------+
## | TEAM_BASERUN_CS\  | Mean (sd) : 136.5 (381.7)\   | 15 distinct values | ![](/tmp/ds0002.png) | 0\      |
## | [numeric]         | min < med < max:\            |                    |                      | (0.0%)  |
## |                   | 0 < 24 < 1504\               |                    |                      |         |
## |                   | IQR (CV) : 52.6 (2.8)        |                    |                      |         |
## +-------------------+------------------------------+--------------------+----------------------+---------+
## | TEAM_BASERUN_SB\  | Mean (sd) : 242.1 (553.6)\   | 15 distinct values | ![](/tmp/ds0003.png) | 0\      |
## | [numeric]         | min < med < max:\            |                    |                      | (0.0%)  |
## |                   | 0 < 87.8 < 2145\             |                    |                      |         |
## |                   | IQR (CV) : 109.1 (2.3)       |                    |                      |         |
## +-------------------+------------------------------+--------------------+----------------------+---------+
## | TEAM_BATTING_2B\  | Mean (sd) : 268.2 (571.2)\   | 15 distinct values | ![](/tmp/ds0004.png) | 0\      |
## | [numeric]         | min < med < max:\            |                    |                      | (0.0%)  |
## |                   | 0 < 69 < 2276\               |                    |                      |         |
## |                   | IQR (CV) : 216.1 (2.1)       |                    |                      |         |
## +-------------------+------------------------------+--------------------+----------------------+---------+
## | TEAM_BATTING_3B\  | Mean (sd) : 193.3 (579)\     | 15 distinct values | ![](/tmp/ds0005.png) | 0\      |
## | [numeric]         | min < med < max:\            |                    |                      | (0.0%)  |
## |                   | 0 < 34 < 2276\               |                    |                      |         |
## |                   | IQR (CV) : 62.3 (3)          |                    |                      |         |
## +-------------------+------------------------------+--------------------+----------------------+---------+
## | TEAM_BATTING_BB\  | Mean (sd) : 376.4 (593.4)\   | 15 distinct values | ![](/tmp/ds0006.png) | 0\      |
## | [numeric]         | min < med < max:\            |                    |                      | (0.0%)  |
## |                   | -1 < 122.7 < 2276\           |                    |                      |         |
## |                   | IQR (CV) : 505.6 (1.6)       |                    |                      |         |
## +-------------------+------------------------------+--------------------+----------------------+---------+
## | TEAM_BATTING_H\   | Mean (sd) : 805.8 (901.4)\   | 15 distinct values | ![](/tmp/ds0007.png) | 0\      |
## | [numeric]         | min < med < max:\            |                    |                      | (0.0%)  |
## |                   | 0.1 < 154.2 < 2554\          |                    |                      |         |
## |                   | IQR (CV) : 1408 (1.1)        |                    |                      |         |
## +-------------------+------------------------------+--------------------+----------------------+---------+
## | TEAM_BATTING_HBP\ | Mean (sd) : 40 (51.3)\       | 15 distinct values | ![](/tmp/ds0008.png) | 0\      |
## | [numeric]         | min < med < max:\            |                    |                      | (0.0%)  |
## |                   | -0.1 < 16.5 < 191\           |                    |                      |         |
## |                   | IQR (CV) : 54.3 (1.3)        |                    |                      |         |
## +-------------------+------------------------------+--------------------+----------------------+---------+
## | TEAM_BATTING_HR\  | Mean (sd) : 218.3 (573.8)\   | 15 distinct values | ![](/tmp/ds0009.png) | 0\      |
## | [numeric]         | min < med < max:\            |                    |                      | (0.0%)  |
## |                   | -1 < 78.6 < 2276\            |                    |                      |         |
## |                   | IQR (CV) : 103.1 (2.6)       |                    |                      |         |
## +-------------------+------------------------------+--------------------+----------------------+---------+
## | TEAM_BATTING_SO\  | Mean (sd) : 503.1 (624.1)\   | 15 distinct values | ![](/tmp/ds0010.png) | 0\      |
## | [numeric]         | min < med < max:\            |                    |                      | (0.0%)  |
## |                   | -0.3 < 284.7 < 2174\         |                    |                      |         |
## |                   | IQR (CV) : 742.6 (1.2)       |                    |                      |         |
## +-------------------+------------------------------+--------------------+----------------------+---------+
## | TEAM_FIELDING_DP\ | Mean (sd) : 202.1 (500)\     | 15 distinct values | ![](/tmp/ds0011.png) | 0\      |
## | [numeric]         | min < med < max:\            |                    |                      | (0.0%)  |
## |                   | -0.4 < 52 < 1990\            |                    |                      |         |
## |                   | IQR (CV) : 135.7 (2.5)       |                    |                      |         |
## +-------------------+------------------------------+--------------------+----------------------+---------+
## | TEAM_FIELDING_E\  | Mean (sd) : 369.9 (706.1)\   | 15 distinct values | ![](/tmp/ds0012.png) | 0\      |
## | [numeric]         | min < med < max:\            |                    |                      | (0.0%)  |
## |                   | 0.1 < 122.2 < 2276\          |                    |                      |         |
## |                   | IQR (CV) : 200.5 (1.9)       |                    |                      |         |
## +-------------------+------------------------------+--------------------+----------------------+---------+
## | TEAM_PITCHING_BB\ | Mean (sd) : 580.1 (1024.2)\  | 15 distinct values | ![](/tmp/ds0013.png) | 0\      |
## | [numeric]         | min < med < max:\            |                    |                      | (0.0%)  |
## |                   | 0 < 135 < 3645\              |                    |                      |         |
## |                   | IQR (CV) : 492.9 (1.8)       |                    |                      |         |
## +-------------------+------------------------------+--------------------+----------------------+---------+
## | TEAM_PITCHING_H\  | Mean (sd) : 2802.8 (7601.7)\ | 15 distinct values | ![](/tmp/ds0014.png) | 0\      |
## | [numeric]         | min < med < max:\            |                    |                      | (0.0%)  |
## |                   | 0.1 < 1137 < 30132\          |                    |                      |         |
## |                   | IQR (CV) : 1479.6 (2.7)      |                    |                      |         |
## +-------------------+------------------------------+--------------------+----------------------+---------+
## | TEAM_PITCHING_HR\ | Mean (sd) : 224.5 (574.4)\   | 14 distinct values | ![](/tmp/ds0015.png) | 0\      |
## | [numeric]         | min < med < max:\            |                    |                      | (0.0%)  |
## |                   | -0.6 < 74.1 < 2276\          |                    |                      |         |
## |                   | IQR (CV) : 105.9 (2.6)       |                    |                      |         |
## +-------------------+------------------------------+--------------------+----------------------+---------+
## | TEAM_PITCHING_SO\ | Mean (sd) : 1774.6 (4875.1)\ | 15 distinct values | ![](/tmp/ds0016.png) | 0\      |
## | [numeric]         | min < med < max:\            |                    |                      | (0.0%)  |
## |                   | 0 < 553.1 < 19278\           |                    |                      |         |
## |                   | IQR (CV) : 756.8 (2.7)       |                    |                      |         |
## +-------------------+------------------------------+--------------------+----------------------+---------+
# Exploratory graphs (BIN) ----
# Draw one density plot for every column of `training.select`.

# Build the density plot for a single column: NA entries are removed first,
# and `label` is used both as the plot title and as the x-axis label.
plot_density <- function(values, label) {
  observed <- data.frame(var = values[!is.na(values)])
  ggplot(data = observed, mapping = aes(x = var)) +
    geom_density(fill = "blue", color = "black") +
    labs(title = paste(label), x = label, y = "Density")
}

variable_names <- names(training.select)
for (variable in variable_names) {
  # ggplot objects are not auto-printed inside a loop, so print explicitly.
  print(plot_density(training.select[[variable]], variable))
}

# Filling in missing observations ----
# For each column listed in `missing_flags`:
#   1. add a 0/1 indicator column marking the rows that were originally NA;
#   2. replace the NA entries with the mean of the observed values.

# Row index of the data set. Derived from the data instead of the previous
# hard-coded 1:2276, so the script keeps working when the row count differs.
REPLACE <- seq_len(nrow(training))

# Replace the NA entries of `x` with the mean of its non-NA entries.
# Returns `x` unchanged when it has no missing values.
impute_mean <- function(x) {
  if (anyNA(x)) {
    x[is.na(x)] <- mean(x, na.rm = TRUE)
  }
  x
}

# Source column -> name of its missing-value indicator column.
# The indicator names (including "Batting" for the two BASERUN columns) and
# their order are kept exactly as before, because later code selects columns
# by position.
missing_flags <- c(
  TEAM_BATTING_SO  = "MissingBattingSO",
  TEAM_BASERUN_SB  = "MissingBattingSB",
  TEAM_BASERUN_CS  = "MissingBattingCS",
  # Think about dropping TEAM_BATTING_HBP entirely due to the low number of
  # original observations.
  TEAM_BATTING_HBP = "MissingBattingHBP",
  TEAM_PITCHING_SO = "MissingPitchingSO",
  TEAM_FIELDING_DP = "MissingFieldingDP"
)

for (column in names(missing_flags)) {
  flag <- missing_flags[[column]]
  # The indicator has to be recorded before the NAs are overwritten.
  training[[flag]] <- ifelse(is.na(training[[column]]), 1, 0)
  training[[column]] <- impute_mean(training[[column]])
}

# Removing HBP and CS ----
# Drop both columns in one select() call; the remaining columns keep their
# existing order.
training <- select(training, -TEAM_BATTING_HBP, -TEAM_BASERUN_CS)
# Correlation plot and proof of no missing observations ----
# Correlate columns 2 through 15 of the cleaned data and draw the resulting
# matrix as a heatmap on a 20-step green-white-red colour scale.
training.select2 <- training[2:15]
training.cor <- cor(x = training.select2)
palette <- colorRampPalette(colors = c("green", "white", "red"))(20)
heatmap(
  x = training.cor,
  col = palette,
  symm = TRUE
)

# Open the cleaned data in the viewer, then redraw the missingness map.
View(training)
vis_miss(
  x = training,
  sort_miss = TRUE,
  show_perc = TRUE
)