#Load the six yearly loan files and the two state lookup tables.
#Every input sits in the same folder, so the folder is named once here and
#each file is addressed relative to it; pointing the script at another
#location only requires editing data_dir.
data_dir <- "F:\\2022_NEC\\R\\Week11\\Final Case Analysis"

#Yearly loan files. In the recorded run each file has 32 columns and its
#unnamed first column is renamed to `...1` by read_csv.
#2012: 53368 rows; 18 chr / 14 dbl columns (later years report 16 chr /
#16 dbl, so some columns are typed differently in this file).
data_2012 <- read_csv(file.path(data_dir, "data2012.csv"))
#2013: 134814 rows; 16 chr / 16 dbl columns.
data_2013 <- read_csv(file.path(data_dir, "data2013.csv"))
#2014: 235629 rows; 16 chr / 16 dbl columns.
data_2014 <- read_csv(file.path(data_dir, "data2014.csv"))
#2015: 421095 rows; 16 chr / 16 dbl columns.
#NOTE(review): this read warned "One or more parsing issues"; inspect them
#with problems(data_2015).
data_2015 <- read_csv(file.path(data_dir, "data2015.csv"))
#2016: 434407 rows; 16 chr / 16 dbl columns.
data_2016 <- read_csv(file.path(data_dir, "data2016.csv"))
#2017: 443579 rows; 16 chr / 16 dbl columns.
#NOTE(review): this read warned "One or more parsing issues"; inspect them
#with problems(data_2017).
data_2017 <- read_csv(file.path(data_dir, "data2017.csv"))

#State-level lookup tables.
#States.csv: 52 rows x 8 columns; 1 chr column (Geography) and 7 dbl columns.
data_states <- read_csv(file.path(data_dir, "States.csv"))
#states_regions.csv: 51 rows x 5 columns; 4 chr columns (State, State Code,
#Region, Division) plus an unnamed fifth column that read_csv renames to
#`...5` and parses as lgl.
data_regions <- read_csv(file.path(data_dir, "states_regions.csv"))
#Stack the six yearly tables into a single table.
#NOTE(review): the read_csv column specs recorded for 2012 list 18 chr / 14 dbl
#columns while 2013-2017 list 16 chr / 16 dbl, so some columns probably differ
#in type between years and rbind() will coerce them - confirm which ones.
LC_data <- do.call(
  rbind,
  list(data_2012, data_2013, data_2014, data_2015, data_2016, data_2017)
)
#Drop `...1` (the per-file index, which no longer applies to the stacked
#table) and `issue_d` (already split into the issue_Month and issue_Year
#columns).
LC_data_drop_columns <- LC_data[, !(names(LC_data) %in% c("...1", "issue_d"))]
#Loan Amount: histogram with bins 1000 wide
ggplot(LC_data_drop_columns, aes(x = loan_amnt)) +
  geom_histogram(binwidth = 1000)
## Warning: Removed 12 rows containing non-finite values (stat_bin).
#Summary statistics of the loan amount (the NA count is reported too)
summary(LC_data_drop_columns[["loan_amnt"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1000 8000 12800 14863 20000 40000 12
#Employment Length: number of loans per emp_length value
ggplot(LC_data_drop_columns, aes(x = emp_length)) +
  geom_bar()
#Annual Income: histogram over the full range with bins 10000 wide
ggplot(LC_data_drop_columns, aes(x = annual_inc)) +
  geom_histogram(binwidth = 10000)
## Warning: Removed 16 rows containing non-finite values (stat_bin).
#Summary statistics of the annual income (the NA count is reported too)
summary(LC_data_drop_columns[["annual_inc"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0 46500 65000 77607 92000 110000000 16
#It is unclear whether the extremely large maximum is an erroneous value, so
#for plotting purposes the view is zoomed in on a smaller x-axis range.
#coord_cartesian() only zooms the view; no rows are dropped before binning.
ggplot(LC_data_drop_columns, aes(x = annual_inc)) +
  geom_histogram(binwidth = 10000) +
  coord_cartesian(xlim = c(0, 1e6))
## Warning: Removed 16 rows containing non-finite values (stat_bin).
#Annual Income: closer zoom (0 to 300000) with finer bins 5000 wide
ggplot(LC_data_drop_columns, aes(x = annual_inc)) +
  geom_histogram(binwidth = 5000) +
  coord_cartesian(xlim = c(0, 3e5))
## Warning: Removed 16 rows containing non-finite values (stat_bin).
#Number of loans per inq_last_6mths value
ggplot(LC_data_drop_columns, aes(x = inq_last_6mths)) +
  geom_bar()
#Loan amount histograms (bins 1000 wide), one panel row per grade; the bar
#outlines are coloured by grade as well
ggplot(LC_data_drop_columns) +
  geom_histogram(aes(x = loan_amnt, color = grade), binwidth = 1000) +
  facet_grid(rows = vars(grade))
## Warning: Removed 12 rows containing non-finite values (stat_bin).
#Loan amount against annual income, zoomed to incomes between 0 and 500000
ggplot(LC_data_drop_columns, aes(x = annual_inc, y = loan_amnt)) +
  geom_point() +
  coord_cartesian(xlim = c(0, 5e5))
## Warning: Removed 16 rows containing missing values (geom_point).
#2-D binned counts of loan amount against annual income
#NOTE(review): the x-axis limit here is 12000000, whereas the scatter plot of
#the same two variables uses 500000 - confirm this limit is intended.
ggplot(LC_data_drop_columns, aes(x = annual_inc, y = loan_amnt)) +
  geom_bin2d() +
  coord_cartesian(xlim = c(0, 12000000))
## Warning: Removed 16 rows containing non-finite values (stat_bin2d).