# Print numbers in plain decimal form: a large `scipen` penalty makes R
# prefer fixed notation over scientific notation when printing.
options(scipen = 999)
From Module 2, read the car.data.csv file, and provide a summary of the “doors,” “persons,” and “safety” variables. Provide a brief description of the variables.
The dataset describes different characteristics of cars. The dataset contains 1728 observations.
# Attach readr once with library(): unlike require(), it stops with an error
# when the package is missing instead of returning FALSE.
library(readr)
# Read the car evaluation data from the working directory.
car_data <- read_csv("car.data.csv")
## Parsed with column specification:
## cols(
## buying = col_character(),
## maint = col_character(),
## doors = col_character(),
## persons = col_character(),
## lug_boot = col_character(),
## safety = col_character(),
## rating = col_character()
## )
# Type transformations: convert each of the seven columns to a factor.
factor_cols <- c(
  "safety", "maint", "buying", "lug_boot", "rating", "doors", "persons"
)
car_data[factor_cols] <- lapply(car_data[factor_cols], factor)
# Build the three-variable data frame without attach(): with() looks the
# columns up inside car_data for this one expression only, so nothing is left
# on the search path and no detach() is needed afterwards.
car_summary <- with(car_data, data.frame(doors, persons, safety))
summary(car_summary)
## doors persons safety
## 2 :432 2 :576 high:576
## 3 :432 4 :576 low :576
## 4 :432 more:576 med :576
## 5more:432
# Garbage collection
rm(car_data, car_summary, factor_cols)
From Module 3, read the german data set, and provide a “table” analysis of the “status of existing checking account” variable. Briefly describe the output.
library(tibble)
# From: http://archive.ics.uci.edu/ml/datasets/Statlog+(German+Credit+Data)
# The file is read without a header row and with strings kept as character
# (TRUE/FALSE are spelled out because T/F are ordinary, reassignable
# variables); the column names are assigned explicitly below.
d <- read.table(
  "http://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data",
  stringsAsFactors = FALSE,
  header = FALSE
)
colnames(d) <- c(
  "Status.of.existing.checking.account", "Duration.in.month",
  "Credit.history", "Purpose", "Credit.amount", "Savings.account.bonds",
  "Present.employment.since",
  "Installment.rate.in.percentage.of.disposable.income",
  "Personal.status.and.sex", "Other.debtors.guarantors",
  "Present.residence.since", "Property", "Age.in.years",
  "Other.installment.plans", "Housing",
  "Number.of.existing.credits.at.this.bank", "Job",
  "Number.of.people.being.liable.to.provide.maintenance.for",
  "Telephone", "foreign.worker", "Good.Loan"
)
# Lookup table from the raw category codes used in the data file to
# human-readable labels. Entries are grouped by the kind of value they label.
mapping <- list(
  # checking account status
  A11 = "... < 0 DM",
  A12 = "0 <= ... < 200 DM",
  A13 = "... >= 200 DM / salary assignments for at least 1 year",
  A14 = "no checking account",
  # credit history
  A30 = "no credits taken/all credits paid back duly",
  A31 = "all credits at this bank paid back duly",
  A32 = "existing credits paid back duly till now",
  A33 = "delay in paying off in the past",
  A34 = "critical account/other credits existing (not at this bank)",
  # purpose
  A40 = "car (new)",
  A41 = "car (used)",
  A42 = "furniture/equipment",
  A43 = "radio/television",
  A44 = "domestic appliances",
  A45 = "repairs",
  A46 = "education",
  A47 = "(vacation - does not exist?)",
  A48 = "retraining",
  A49 = "business",
  A410 = "others",
  # savings account / bonds
  A61 = "... < 100 DM",
  A62 = "100 <= ... < 500 DM",
  A63 = "500 <= ... < 1000 DM",
  A64 = ".. >= 1000 DM",
  A65 = "unknown/ no savings account",
  # length of present employment
  A71 = "unemployed",
  A72 = "... < 1 year",
  A73 = "1 <= ... < 4 years",
  A74 = "4 <= ... < 7 years",
  A75 = ".. >= 7 years",
  # personal status and sex
  A91 = "male : divorced/separated",
  A92 = "female : divorced/separated/married",
  A93 = "male : single",
  A94 = "male : married/widowed",
  A95 = "female : single",
  # other debtors / guarantors
  A101 = "none",
  A102 = "co-applicant",
  A103 = "guarantor",
  # property
  A121 = "real estate",
  A122 = "if not A121 : building society savings agreement/life insurance",
  A123 = "if not A121/A122 : car or other, not in attribute 6",
  A124 = "unknown / no property",
  # other installment plans
  A141 = "bank",
  A142 = "stores",
  A143 = "none",
  # housing
  A151 = "rent",
  A152 = "own",
  A153 = "for free",
  # job
  A171 = "unemployed/ unskilled - non-resident",
  A172 = "unskilled - resident",
  A173 = "skilled employee / official",
  A174 = "management/ self-employed/highly qualified employee/ officer",
  # telephone
  A191 = "none",
  A192 = "yes, registered under the customers name",
  # foreign worker
  A201 = "yes",
  A202 = "no"
)
# Recode every character column: look each raw code up in `mapping` and store
# the resulting labels as a factor. Non-character columns are left untouched.
# seq_along() is safe even for a zero-column data frame (1:0 would iterate
# over c(1, 0)), and is.character() avoids comparing class vectors with `==`.
for (i in seq_along(d)) {
  if (is.character(d[[i]])) {
    d[[i]] <- as.factor(as.character(mapping[d[[i]]]))
  }
}
# Turn the outcome column into a two-level factor: a value of 1 becomes
# "GoodLoan", any other value becomes "BadLoan".
d[["Good.Loan"]] <- as.factor(
  ifelse(d[["Good.Loan"]] == 1, "GoodLoan", "BadLoan")
)
# Names of every column except the outcome.
vars <- setdiff(names(d), "Good.Loan")
# Keep the prepared data under a more descriptive name.
creditdata <- d
# Not part of the GCD data: two notional 2x2 contingency tables (example for
# listing 1.3). Rows are the loan size relative to disposable income, columns
# are the loan quality in each hypothetical population. Values are filled
# column by column.
tab1 <- as.table(matrix(
  c(50, 6, 0, 44),
  nrow = 2,
  ncol = 2,
  dimnames = list(
    loan.as.pct.disposable.income = c("LT.15pct", "GT.15pct"),
    loan.quality.pop1 = c("goodloan", "badloan")
  )
))
tab2 <- as.table(matrix(
  c(34, 18, 16, 32),
  nrow = 2,
  ncol = 2,
  dimnames = list(
    loan.as.pct.disposable.income = c("LT.15pct", "GT.15pct"),
    loan.quality.pop2 = c("goodloan", "badloan")
  )
))
# Garbage collection: remove the intermediate objects by name
# (creditdata is not in this list and is kept).
rm(list = c("mapping", "d", "tab1", "tab2", "vars", "i"))
From Module 3, read the german data set, and provide a “table” analysis of the “status of existing checking account” variable. Briefly describe the output.
# Count the observations in each checking-account category. enframe() with
# name = NULL wraps the vector in a one-column tibble named `value`; calling
# as_tibble() directly on a vector is discouraged by tibble and warns.
summary(
  tibble::enframe(creditdata$Status.of.existing.checking.account, name = NULL)
)
## value
## ... < 0 DM :274
## ... >= 200 DM / salary assignments for at least 1 year: 63
## 0 <= ... < 200 DM :269
## no checking account :394
# Garbage collection
rm(list = "creditdata")
From Module 4, read the custdata.tsv data, and generate a bar chart of customers by state of residence in descending order. Provide a brief description of the chart.
Prepare data:
# library() attaches one package per call: in library(dplyr, tibble) the second
# argument is matched to `help`, so tibble is not attached by that call.
library(dplyr)
library(tibble)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Read the tab-separated customer file; the first row holds the column names.
customerData.df <- read.table("./custdata.tsv", header = TRUE, sep = "\t")
# Number of customers per state of residence.
# count() tallies the rows for each value of state.of.res (see ?count), and
# sort = TRUE orders the result from the largest count to the smallest.
# The default count column `n` is renamed to something more descriptive.
custCountByState.tbl <- as_tibble(customerData.df) %>%
  count(state.of.res, sort = TRUE) %>%
  rename(numberOfObservationsInState = n)
# Garbage collection
rm(customerData.df)
# Attach ggplot2 once with library(); it errors if the package is missing,
# whereas require() only returns FALSE.
library(ggplot2)
# Horizontal bar chart of customer counts per state. reorder() orders the
# states by their count and coord_flip() turns the bars sideways so the state
# names are readable. Explicit axis titles replace the default, which would
# be the raw reorder(...) expression.
ggplot(custCountByState.tbl) +
  geom_col(
    aes(
      x = reorder(state.of.res, numberOfObservationsInState),
      y = numberOfObservationsInState
    ),
    fill = "blue"
  ) +
  coord_flip() +
  labs(x = "State of residence", y = "Number of customers")
# Garbage collection
rm(custCountByState.tbl)
From Module 5, read the psub.Rdata and perform an ordinary least squares (OLS) regression on the ENTIRE data. Personal income is your variable of interest, which will be a function of the explanatory variables age, sex, working class, and education level. Be sure to briefly explain the regression output.
# Load data (the file is expected to provide the `psub` data frame used below)
load("psub.RData")
# Create datafile: keep only the model variables, under readable names.
varsForOLS <- psub %>%
  select(
    personalIncome = PINCP,
    groupingID = ORIGRANDGROUP,
    age = AGEP,
    sex = SEX,
    workingClass = COW,
    educationLevel = SCHL
  )
# Get regression output. The data frame is passed through `data =` rather than
# attach(), so nothing is left on the search path after the fit.
# NOTE(review): groupingID is in the model although the task lists only age,
# sex, working class and education level -- confirm that this is intended.
summary(lm(
  personalIncome ~ age + sex + workingClass + educationLevel + groupingID,
  data = varsForOLS
))
##
## Call:
## lm(formula = personalIncome ~ age + sex + workingClass + educationLevel +
## groupingID, data = varsForOLS)
##
## Residuals:
## Min 1Q Median 3Q Max
## -91692 -17398 -4120 11897 195790
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) -7434.0865 4744.8406 -1.567
## age 1066.7433 99.6603 10.704
## sexF -11375.8825 1721.2481 -6.609
## workingClassFederal government employee 7034.8539 4523.4830 1.555
## workingClassLocal government employee -7461.3689 3315.4484 -2.250
## workingClassPrivate not-for-profit employee -8867.9873 2809.7547 -3.156
## workingClassSelf-employed incorporated 5512.2580 4883.9231 1.129
## workingClassSelf-employed not incorporated -10866.7183 4610.8969 -2.357
## workingClassState government employee -11493.7408 4246.4707 -2.707
## educationLevelAssociate's degree 21475.9598 3996.8974 5.373
## educationLevelBachelor's degree 37293.9782 3391.2106 10.997
## educationLevelDoctorate degree 47488.8295 10544.9251 4.503
## educationLevelGED or alternative credential 9461.4068 5275.3554 1.794
## educationLevelMaster's degree 51238.1550 3985.9499 12.855
## educationLevelProfessional degree 84405.9363 7255.1298 11.634
## educationLevelRegular high school diploma 7923.2703 3309.3434 2.394
## educationLevelsome college credit, no degree 14543.8061 3343.7947 4.349
## groupingID 0.7427 2.8546 0.260
## Pr(>|t|)
## (Intercept) 0.11743
## age < 0.0000000000000002 ***
## sexF 0.0000000000579 ***
## workingClassFederal government employee 0.12016
## workingClassLocal government employee 0.02460 *
## workingClassPrivate not-for-profit employee 0.00164 **
## workingClassSelf-employed incorporated 0.25927
## workingClassSelf-employed not incorporated 0.01859 *
## workingClassState government employee 0.00689 **
## educationLevelAssociate's degree 0.0000000927900 ***
## educationLevelBachelor's degree < 0.0000000000000002 ***
## educationLevelDoctorate degree 0.0000073330317 ***
## educationLevelGED or alternative credential 0.07314 .
## educationLevelMaster's degree < 0.0000000000000002 ***
## educationLevelProfessional degree < 0.0000000000000002 ***
## educationLevelRegular high school diploma 0.01681 *
## educationLevelsome college credit, no degree 0.0000147975404 ***
## groupingID 0.79477
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 28670 on 1206 degrees of freedom
## Multiple R-squared: 0.3227, Adjusted R-squared: 0.3132
## F-statistic: 33.8 on 17 and 1206 DF, p-value: < 0.00000000000000022
# Garbage collection: remove the regression data and the loaded source data.
rm(list = c("varsForOLS", "psub"))