# Strongly penalize scientific notation so numbers print as plain decimals
options(scipen = 999)

Task 1

From Module 2, read the car.data.csv file, and provide a summary of the “doors,” “persons,” and “safety” variables. Provide a brief description of the variables.

Variable descriptions

The dataset describes different characteristics of cars. The dataset contains 1728 observations.

Doors : The number of doors on a car (Factor w/ 4 levels “2”,“3”,“4”,“5more”)
Persons : The number of people the car is rated for. (Factor w/ 3 levels “2”,“4”,“more”)
Safety : The safety rating of the car. (Factor w/ 3 levels “high”,“low”,“med”)

# Attach readr once. library() stops with an error when the package is
# missing, so a separate require() call beforehand adds nothing.
library(readr)
# Read car.data.csv from the working directory.
car_data <- read_csv("car.data.csv")

## Parsed with column specification:
## cols(
##   buying = col_character(),
##   maint = col_character(),
##   doors = col_character(),
##   persons = col_character(),
##   lug_boot = col_character(),
##   safety = col_character(),
##   rating = col_character()
## )

# Type transformations: the columns are parsed as character, so convert each
# of them to a factor.
for (column_name in c("safety", "maint", "buying", "lug_boot", "rating",
                      "doors", "persons")) {
  car_data[[column_name]] <- factor(car_data[[column_name]])
}
# Drop the loop variable so the workspace holds no extra object.
rm(column_name)

# Put the converted columns on the search path so they can be referenced by
# bare name; this is undone by detach() in the clean-up step.
attach(car_data)

Summary of the “doors,” “persons,” and “safety” variables

# Select the three variables of interest explicitly from car_data. Looking the
# columns up by bare name depends on the search path, where a global object
# with the same name would silently take precedence.
car_summary <- as.data.frame(car_data[c("doors", "persons", "safety")])
# Per-level counts for each of the three factors.
summary(car_summary)

##    doors     persons     safety   
##  2    :432   2   :576   high:576  
##  3    :432   4   :576   low :576  
##  4    :432   more:576   med :576  
##  5more:432

# Garbage collection
# Only car_data was attached, and detach()'s second argument is a search-path
# position rather than another object, so detach car_data on its own.
detach(car_data)
rm(car_data, car_summary)

Task 2

From Module 3, read the german data set, and provide a “table” analysis of the “status of existing checking account” variable. Briefly describe the output.

library(tibble)

# From: http://archive.ics.uci.edu/ml/datasets/Statlog+(German+Credit+Data)
# Download the raw file. It has no header row, and strings are kept as
# character so the coded columns can be detected and decoded below.
d <- read.table('http://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data',
                stringsAsFactors = FALSE, header = FALSE)
colnames(d) <- c('Status.of.existing.checking.account', 'Duration.in.month',
                 'Credit.history', 'Purpose', 'Credit.amount', 'Savings.account.bonds', 
                 'Present.employment.since',
                 'Installment.rate.in.percentage.of.disposable.income',
                 'Personal.status.and.sex', 'Other.debtors.guarantors',
                 'Present.residence.since', 'Property', 'Age.in.years',
                 'Other.installment.plans', 'Housing',
                 'Number.of.existing.credits.at.this.bank', 'Job',
                 'Number.of.people.being.liable.to.provide.maintenance.for',
                 'Telephone', 'foreign.worker', 'Good.Loan')
# Lookup from attribute code to readable label. A named character vector is
# used instead of a list so that a code without an entry decodes to NA rather
# than to the literal string "NULL".
mapping <- c('A11'='... < 0 DM',
             'A12'='0 <= ... < 200 DM',
             'A13'='... >= 200 DM / salary assignments for at least 1 year',
             'A14'='no checking account',
             'A30'='no credits taken/all credits paid back duly',
             'A31'='all credits at this bank paid back duly',
             'A32'='existing credits paid back duly till now',
             'A33'='delay in paying off in the past',
             'A34'='critical account/other credits existing (not at this bank)',
             'A40'='car (new)',
             'A41'='car (used)',
             'A42'='furniture/equipment',
             'A43'='radio/television',
             'A44'='domestic appliances',
             'A45'='repairs',
             'A46'='education',
             'A47'='(vacation - does not exist?)',
             'A48'='retraining',
             'A49'='business',
             'A410'='others',
             'A61'='... < 100 DM',
             'A62'='100 <= ... < 500 DM',
             'A63'='500 <= ... < 1000 DM',
             'A64'='.. >= 1000 DM',
             'A65'='unknown/ no savings account',
             'A71'='unemployed',
             'A72'='... < 1 year',
             'A73'='1 <= ... < 4 years',
             'A74'='4 <= ... < 7 years',
             'A75'='.. >= 7 years',
             'A91'='male : divorced/separated',
             'A92'='female : divorced/separated/married',
             'A93'='male : single',
             'A94'='male : married/widowed',
             'A95'='female : single',
             'A101'='none',
             'A102'='co-applicant',
             'A103'='guarantor',
             'A121'='real estate',
             'A122'='if not A121 : building society savings agreement/life insurance',
             'A123'='if not A121/A122 : car or other, not in attribute 6',
             'A124'='unknown / no property',
             'A141'='bank',
             'A142'='stores',
             'A143'='none',
             'A151'='rent',
             'A152'='own',
             'A153'='for free',
             'A171'='unemployed/ unskilled - non-resident',
             'A172'='unskilled - resident',
             'A173'='skilled employee / official',
             'A174'='management/ self-employed/highly qualified employee/ officer',
             'A191'='none',
             'A192'='yes, registered under the customers name',
             'A201'='yes',
             'A202'='no')
# Decode every character column through the lookup and store it as a factor.
# seq_len() stays empty for a zero-column frame, and is.character() is a
# proper type test where class(x) == 'character' is not.
for (i in seq_len(ncol(d))) {
  if (is.character(d[[i]])) {
    # unname() keeps the codes from being carried along as element names.
    d[[i]] <- factor(unname(mapping[d[[i]]]))
  }
}
# A value of 1 marks a good loan; every other value is labelled a bad loan.
d$Good.Loan <- as.factor(ifelse(d$Good.Loan == 1, 'GoodLoan', 'BadLoan'))
# All column names except Good.Loan.
vars <- setdiff(colnames(d), 'Good.Loan')
creditdata <- d

# Not part of the GCD data: notional example for listing 1.3.
# Two 2x2 tables of loan quality by loan size relative to disposable income;
# the cell values fill each table column by column.
tab1 <- as.table(matrix(
  c(50, 6, 0, 44),
  nrow = 2, ncol = 2,
  dimnames = list(
    'loan.as.pct.disposable.income' = c('LT.15pct', 'GT.15pct'),
    'loan.quality.pop1' = c('goodloan', 'badloan')
  )
))
tab2 <- as.table(matrix(
  c(34, 18, 16, 32),
  nrow = 2, ncol = 2,
  dimnames = list(
    'loan.as.pct.disposable.income' = c('LT.15pct', 'GT.15pct'),
    'loan.quality.pop2' = c('goodloan', 'badloan')
  )
))

# Garbage collection: drop the intermediate objects; creditdata is not in the
# list and therefore stays in the workspace
rm(mapping, d, tab1, tab2, vars, i)

From Module 3, read the german data set, and provide a “table” analysis of the “status of existing checking account” variable. Briefly describe the output.

# Count the observations in each checking-account category. enframe() with
# name = NULL turns the vector into a one-column tibble named "value";
# as_tibble() on a bare vector is discouraged and emits a warning.
summary(tibble::enframe(creditdata$Status.of.existing.checking.account,
                        name = NULL))

##                                                     value    
##  ... < 0 DM                                            :274  
##  ... >= 200 DM / salary assignments for at least 1 year: 63  
##  0 <= ... < 200 DM                                     :269  
##  no checking account                                   :394

# Garbage collection
rm(creditdata)

Task 3

From Module 4, read the custdata.tsv data, and generate a bar chart of customers by state of residence in descending order. Provide a brief description of the chart.

Prepare data:

# library() attaches a single package per call (its second positional
# argument is `help`), so each package gets its own call.
library(dplyr)
library(tibble)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Read the tab-separated customer file and count the customers per state of
# residence. count() groups by state.of.res, sort = TRUE orders the result by
# descending count, and `name` labels the count column directly. No
# intermediate `n` column or temporary data frame is created, so nothing has
# to be removed afterwards.
custCountByState.tbl <-
  read.table("./custdata.tsv", header = TRUE, sep = "\t") %>%
  as_tibble() %>%
  count(state.of.res, sort = TRUE, name = "numberOfObservationsInState")

# Attach ggplot2 once. library() stops with an error when the package is
# missing, so a separate require() call beforehand adds nothing.
library(ggplot2)
# Horizontal bar chart of customers per state. reorder() sorts the states by
# their count and coord_flip() turns the bars sideways, which places the
# state with the most customers at the top. Explicit axis labels replace the
# default label, which would be the raw reorder(...) expression.
ggplot(custCountByState.tbl) +
  geom_col(aes(x = reorder(state.of.res, numberOfObservationsInState),
               y = numberOfObservationsInState), fill = "blue") +
  coord_flip() +
  labs(x = "State of residence", y = "Number of customers")

# Garbage collection
rm(custCountByState.tbl)

Task 4

From Module 5, read the psub.Rdata and perform an ordinary least squares (OLS) regression on the ENTIRE data. Personal income is your variable of interest, which will be a function of the explanatory variables age, sex, working class, and education level. Be sure to briefly explain the regression output.

# Load data (the file is expected to provide the psub object used below)
load("psub.RData")

# Create datafile: keep only the model variables, renaming them on selection
varsForOLS <- psub %>%
  select(personalIncome = PINCP, groupingID = ORIGRANDGROUP, age = AGEP,
         sex = SEX, workingClass = COW, educationLevel = SCHL)

# Get regression output. The data frame is handed to lm() through `data`
# instead of being attach()-ed, so nothing stays on the search path after
# varsForOLS is removed.
summary(lm(
  personalIncome ~ age + sex + workingClass + educationLevel + groupingID,
  data = varsForOLS
))

## 
## Call:
## lm(formula = personalIncome ~ age + sex + workingClass + educationLevel + 
##     groupingID)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -91692 -17398  -4120  11897 195790 
## 
## Coefficients:
##                                                 Estimate  Std. Error t value
## (Intercept)                                   -7434.0865   4744.8406  -1.567
## age                                            1066.7433     99.6603  10.704
## sexF                                         -11375.8825   1721.2481  -6.609
## workingClassFederal government employee        7034.8539   4523.4830   1.555
## workingClassLocal government employee         -7461.3689   3315.4484  -2.250
## workingClassPrivate not-for-profit employee   -8867.9873   2809.7547  -3.156
## workingClassSelf-employed incorporated         5512.2580   4883.9231   1.129
## workingClassSelf-employed not incorporated   -10866.7183   4610.8969  -2.357
## workingClassState government employee        -11493.7408   4246.4707  -2.707
## educationLevelAssociate's degree              21475.9598   3996.8974   5.373
## educationLevelBachelor's degree               37293.9782   3391.2106  10.997
## educationLevelDoctorate degree                47488.8295  10544.9251   4.503
## educationLevelGED or alternative credential    9461.4068   5275.3554   1.794
## educationLevelMaster's degree                 51238.1550   3985.9499  12.855
## educationLevelProfessional degree             84405.9363   7255.1298  11.634
## educationLevelRegular high school diploma      7923.2703   3309.3434   2.394
## educationLevelsome college credit, no degree  14543.8061   3343.7947   4.349
## groupingID                                        0.7427      2.8546   0.260
##                                                          Pr(>|t|)    
## (Intercept)                                               0.11743    
## age                                          < 0.0000000000000002 ***
## sexF                                              0.0000000000579 ***
## workingClassFederal government employee                   0.12016    
## workingClassLocal government employee                     0.02460 *  
## workingClassPrivate not-for-profit employee               0.00164 ** 
## workingClassSelf-employed incorporated                    0.25927    
## workingClassSelf-employed not incorporated                0.01859 *  
## workingClassState government employee                     0.00689 ** 
## educationLevelAssociate's degree                  0.0000000927900 ***
## educationLevelBachelor's degree              < 0.0000000000000002 ***
## educationLevelDoctorate degree                    0.0000073330317 ***
## educationLevelGED or alternative credential               0.07314 .  
## educationLevelMaster's degree                < 0.0000000000000002 ***
## educationLevelProfessional degree            < 0.0000000000000002 ***
## educationLevelRegular high school diploma                 0.01681 *  
## educationLevelsome college credit, no degree      0.0000147975404 ***
## groupingID                                                0.79477    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 28670 on 1206 degrees of freedom
## Multiple R-squared:  0.3227, Adjusted R-squared:  0.3132 
## F-statistic:  33.8 on 17 and 1206 DF,  p-value: < 0.00000000000000022

# Garbage collection: remove the regression data frame and the loaded psub
rm(varsForOLS, psub)