library("readr")
# Read the semicolon-delimited bank dataset into `bankData`.
bank_file <- "C:/Users/trist/Documents/Homework 4/bankData.txt"
bankData <- read_delim(file = bank_file, delim = ";")
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_character(),
## age = col_double(),
## duration = col_double(),
## campaign = col_double(),
## pdays = col_double(),
## previous = col_double(),
## emp.var.rate = col_double(),
## cons.price.idx = col_double(),
## cons.conf.idx = col_double(),
## euribor3m = col_double(),
## nr.employed = col_double()
## )
## i Use `spec()` for the full column specifications.
# Base-graphics density histogram of age, overlaid with a kernel density
# curve and a vertical reference line at the mean age.
# Missing ages are removed once up front: density() stops on NA input and
# mean() would return NA, which would silently drop the reference line.
ages <- bankData$age[!is.na(bankData$age)]
hist(
  ages,
  main = "Histogram of Ages in the Dataset with Smoothed Density",
  col = "Green",
  freq = FALSE, # plot densities so the density curve shares the y scale
  xlab = "Age of Person",
  ylab = "Density"
)
lines(density(ages), lwd = 3, col = "Black", lty = 1)
abline(v = mean(ages), lwd = 3, lty = 1, col = "Black")
# Base-graphics scatterplot of duration against age, using a distinct
# plotting symbol for each marital status.
# The symbol lookup is built from the `marital` values actually present, and
# the legend is drawn from the same lookup, so points and legend always agree.
marital_levels <- sort(unique(bankData$marital))
marital_pch <- seq_along(marital_levels)
# One pch code per row; rows with a missing marital status get NA (not drawn).
symbols <- marital_pch[match(bankData$marital, marital_levels)]
plot(
  x = bankData$age, y = bankData$duration,
  pch = symbols, xlab = "Age", ylab = "Duration"
)
# Keyword position keeps the legend inside the plot region for any data range.
legend(
  "topright",
  legend = marital_levels,
  pch = marital_pch,
  title = "Marital Status"
)
# Two-way contingency table: rows are marital status, columns are poutcome.
twoTable <- table(bankData$marital, bankData$poutcome)
# Grouped (side-by-side) bar chart: one group per poutcome, one bar per
# marital status. Legend labels come from the table's row names, so they
# cannot drift out of sync with the bars, and `legend.text` is spelled out
# in full instead of relying on partial argument matching.
barplot(
  twoTable,
  beside = TRUE,
  main = "Barplot of Marital Status and Poutcome",
  legend.text = rownames(twoTable),
  args.legend = list(title = "Marital Status")
)
# Two-way contingency table: rows are marital status, columns are poutcome.
stackTable <- table(bankData$marital, bankData$poutcome)
# Horizontal stacked bar chart: one bar per poutcome, segmented by marital
# status. `legend.text = TRUE` labels the legend from the table's row names;
# the argument is spelled out in full rather than partially matched.
barplot(
  stackTable,
  legend.text = TRUE,
  horiz = TRUE,
  main = "Barplot of Marital Status and Poutcome",
  args.legend = list(title = "Marital Status", cex = 0.65)
)
# Base-graphics boxplots of duration, one box per month.
duration_by_month <- duration ~ month
boxplot(duration_by_month, data = bankData)
# Install ggplot2 only when it is not already available, so re-running the
# script does not re-download the package each time. Uses the https mirror.
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2", repos = "https://cran.us.r-project.org")
}
## Installing package into 'C:/Users/trist/Documents/R/win-library/4.0'
## (as 'lib' is unspecified)
## package 'ggplot2' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\trist\AppData\Local\Temp\RtmpCyOtE8\downloaded_packages
library(ggplot2)
# Base ggplot object with `age` mapped to x; plot layers are added to it.
bankdata <- ggplot(bankData, aes(age))
# Density-scaled histogram of age (5-year bins) with a Gaussian kernel
# density curve on top. `after_stat(density)` replaces the deprecated
# `..density..` notation for referring to the computed density.
bankdata +
  geom_histogram(
    aes(y = after_stat(density)),
    binwidth = 5,
    color = "black",
    fill = "green"
  ) +
  geom_density(kernel = "gaussian") +
  ggtitle("Histogram of Ages in the Dataset with Smoothed Density") +
  xlab("Age of Person") +
  ylab("Density")
##Hardly anyone over 60 years old was included in the dataset.##
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v tibble 3.0.6 v dplyr 1.0.4
## v tidyr 1.1.2 v stringr 1.4.0
## v purrr 0.3.4 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
# Scatterplot of duration against age; points are drawn as "X" and coloured
# by marital status.
bankdata +
  geom_point(aes(x = age, y = duration, color = marital), shape = "X") +
  labs(title = "Scatterplot of Age by Duration", x = "Age", y = "Duration")
##After age 30, there appears to be roughly a 70/20/10 split between married, divorced, and single individuals. Hardly any marital-status values were unknown.##
# Scatterplot of duration against age, coloured by marital status, with one
# panel per poutcome value. The title uses "\n" for a real line break.
bankdata +
  geom_point(shape = "X", aes(x = age, y = duration, color = marital)) +
  facet_wrap(~ poutcome) +
  ggtitle("Scatterplot of Age by Duration\n for each value of poutcome") +
  xlab("Age") +
  ylab("Duration")
##After age 30, there appears to be roughly a 70/20/10 split between married, divorced, and single individuals. Hardly any marital-status values were unknown.##
# Dodged bar chart of row counts per marital status, filled by poutcome.
bankdata +
  geom_bar(aes(x = marital, fill = poutcome), position = "dodge") +
  labs(
    title = "Barplot of Marital Status and Poutcome",
    x = "marital",
    y = "count"
  )
##Over 35,000 respondents were listed under nonexistent.##
# Same dodged bar chart of marital status by poutcome, with the axes flipped
# so the bars run horizontally.
bankdata +
  geom_bar(aes(x = marital, fill = poutcome), position = "dodge") +
  labs(
    title = "Barplot of Marital Status and Poutcome",
    x = "marital",
    y = "count"
  ) +
  coord_flip()
##Over 35,000 respondents were listed under nonexistent.##
# Boxplots of duration for each month.
bankdata +
  geom_boxplot(aes(x = month, y = duration)) +
  labs(x = "Month", y = "Duration of Campaign")
##August and November had relatively average median values but had by far the biggest outliers of the entire dataset.##