#set directory #create data table , then rename so you maintain the original #view data #install libraries
setwd("/Users/carolyn.khalil/Desktop/R-Tutorial")
PG2<- read.csv("R-tutorial/Data/PilgrimData.csv")
head(PG2)
## ID X9Profit X9Online X9Age X9Inc X9Tenure X9District X0Profit X0Online
## 1 1 21 0 NA NA 6.33 1200 NA NA
## 2 2 -6 0 6 3 29.50 1200 -32 0
## 3 3 -49 1 5 5 26.41 1100 -22 1
## 4 4 -4 0 NA NA 2.25 1200 NA NA
## 5 5 -61 0 2 9 9.91 1200 -4 0
## 6 6 -38 0 NA 3 2.33 1300 14 0
## X9Billpay X0Billpay
## 1 0 NA
## 2 0 0
## 3 0 0
## 4 0 NA
## 5 0 0
## 6 0 0
summary(PG2)
## ID X9Profit X9Online X9Age
## Min. : 1 Min. :-221.0 Min. :0.0000 Min. :1.000
## 1st Qu.: 7909 1st Qu.: -34.0 1st Qu.:0.0000 1st Qu.:3.000
## Median :15818 Median : 9.0 Median :0.0000 Median :4.000
## Mean :15818 Mean : 111.5 Mean :0.1218 Mean :4.046
## 3rd Qu.:23726 3rd Qu.: 164.0 3rd Qu.:0.0000 3rd Qu.:5.000
## Max. :31634 Max. :2071.0 Max. :1.0000 Max. :7.000
## NA's :8289
## X9Inc X9Tenure X9District X0Profit
## Min. :1.000 Min. : 0.16 Min. :1100 Min. :-5643.0
## 1st Qu.:4.000 1st Qu.: 3.75 1st Qu.:1200 1st Qu.: -30.0
## Median :6.000 Median : 7.41 Median :1200 Median : 23.0
## Mean :5.459 Mean :10.16 Mean :1203 Mean : 144.8
## 3rd Qu.:7.000 3rd Qu.:14.75 3rd Qu.:1200 3rd Qu.: 206.0
## Max. :9.000 Max. :41.16 Max. :1300 Max. :27086.0
## NA's :8261 NA's :5238
## X0Online X9Billpay X0Billpay
## Min. :0.000 Min. :0.00000 Min. :0.00
## 1st Qu.:0.000 1st Qu.:0.00000 1st Qu.:0.00
## Median :0.000 Median :0.00000 Median :0.00
## Mean :0.199 Mean :0.01669 Mean :0.03
## 3rd Qu.:0.000 3rd Qu.:0.00000 3rd Qu.:0.00
## Max. :1.000 Max. :1.00000 Max. :1.00
## NA's :5219 NA's :5219
library(stats)
library(plyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(base)
library(ggplot2)
#what we can see here is ….. (data types)
#normalize function ``` { r} normalize <- function(x) { num <- x - min(x) denom <- max(x) - min(x) return (num/denom) }
#Normalization PDC2 <- normalize(PG2[1:11]) PDC2 <-as.data.frame(lapply(PG2[1:11], normalize))
#list column rows and check for missing data, then delete the rows with missing data and confirm missing data rows = 0
```r
#list column names
colnames(PG2)
## [1] "ID" "X9Profit" "X9Online" "X9Age" "X9Inc"
## [6] "X9Tenure" "X9District" "X0Profit" "X0Online" "X9Billpay"
## [11] "X0Billpay"
sum(!complete.cases(PG2))
## [1] 10551
PDC2 <- na.omit(PG2)
sum(!complete.cases(PDC2))
## [1] 0
PDC2[PDC2$X9Online == 1, "X9Online"] <- "Y"
PDC2[PDC2$X9Online == 0, "X9Online"] <- "N"
#check variable types and make sure profit is numeric and online is character. If not change (MAYBE need to change age, inc, billpay for regression)
sapply(PDC2, class)
## ID X9Profit X9Online X9Age X9Inc X9Tenure
## "integer" "integer" "character" "integer" "integer" "numeric"
## X9District X0Profit X0Online X9Billpay X0Billpay
## "integer" "integer" "integer" "integer" "integer"
PDC2$X9Profit <-as.numeric(PDC2$X9Profit)
PDC2$X9Online <-as.character(PDC2$X9Online)
#delete columns from 2000 data, incomplete yr.
PDC2 <-select(PDC2, -c(8, 9, 11))
View(PDC2)
#boxplot
BP <- boxplot(PDC2$X9Profit ~ PDC2$X9Online)
#Ho: mean online = mean not online #two sided test
#check variances (not working) #install.packages(“car”) #library(car) #levenes test #leveneTest(y=PDC2\(X9Profit, group = PDC2\)X9Online) #check variances, if not equal set var.eq tp F in t test
var(PDC2$X9Profit[PDC2$X9Online=="Y"])
## [1] 86823.35
var(PDC2$X9Profit[PDC2$X9Online=="N"])
## [1] 80531.39
t.test(PDC2$X9Profit ~ PDC2$X9Online, mu=0, alt="two.sided", conf=0.95, var.eq=F, paired=F)
##
## Welch Two Sample t-test
##
## data: PDC2$X9Profit by PDC2$X9Online
## t = -1.2909, df = 3494.6, p-value = 0.1968
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -19.61678 4.04068
## sample estimates:
## mean in group N mean in group Y
## 128.8771 136.6652
#try without outliers, check for Online = Y first.
#remove outliers from X9online = y
library(stats)
quantile(PDC2$X9Profit[PDC2$X9Online=="Y"],0.25 )
## 25%
## -40
quantile(PDC2$X9Profit[PDC2$X9Online=="Y"],0.75 )
## 75%
## 222
quantile(PDC2$X9Profit[PDC2$X9Online=="Y"],0.50 )
## 50%
## 31
IQR(PDC2$X9Profit[PDC2$X9Online=="Y"])
## [1] 262
fivenum(PDC2$X9Profit[PDC2$X9Online=="Y"])
## [1] -220 -40 31 222 1979
#outliers are 1.5 IQR away for Q1and QR
#find range to add and subtract from Q1and Q3
OutlierRange <-IQR(PDC2$X9Profit[PDC2$X9Online=="Y"]) * 1.5
print(OutlierRange)
## [1] 393
#find lower boundary
OutlierLower <- quantile(PDC2$X9Profit[PDC2$X9Online=="Y"],0.25 ) - OutlierRange
print(OutlierLower)
## 25%
## -433
#find upper boundary
OutlierUpper <- quantile(PDC2$X9Profit[PDC2$X9Online=="Y"],0.75 ) + OutlierRange
print(OutlierUpper)
## 75%
## 615
#remove lines with Y and profit less than -433
OutlierLower<-as.numeric(OutlierLower)
d<-PDC2[!(PDC2$X9Online=="Y" & PDC2$X9Profit < OutlierLower),]
d<-d[!(d$X9Online=="Y" & d$X9Profit > OutlierUpper),]
#do the same for X9Online = N
quantile(PDC2$X9Profit[PDC2$X9Online=="N"],0.25 )
## 25%
## -33
quantile(PDC2$X9Profit[PDC2$X9Online=="N"],0.75 )
## 75%
## 200
quantile(PDC2$X9Profit[PDC2$X9Online=="N"],0.50 )
## 50%
## 23
IQR(PDC2$X9Profit[PDC2$X9Online=="N"])
## [1] 233
fivenum(PDC2$X9Profit[PDC2$X9Online=="N"])
## [1] -221 -33 23 200 2071
#outliers are 1.5 IQR away for Q1and QR
#find range to add and subtract from Q1and Q3
OutlierRange2 <-IQR(PDC2$X9Profit[PDC2$X9Online=="N"]) * 1.5
print(OutlierRange2)
## [1] 349.5
#find lower boundary for N
OutlierLower2 <- quantile(PDC2$X9Profit[PDC2$X9Online=="N"],0.25 ) - OutlierRange2
print(OutlierLower2)
## 25%
## -382.5
#find upper boundary for N
OutlierUpper2 <- quantile(PDC2$X9Profit[PDC2$X9Online=="N"],0.75 ) + OutlierRange2
print(OutlierUpper2)
## 75%
## 549.5
#remove lines with N and profit less than outlierlower2 or greater than outlierupper2
OutlierLower2<-as.numeric(OutlierLower2)
d<-d[!(d$X9Online=="N" & d$X9Profit < OutlierLower2),]
d<-d[!(d$X9Online=="N" & d$X9Profit > OutlierUpper2),]
#new boxplot without outliers
BP2 <- boxplot(d$X9Profit ~ d$X9Online)
#Ho: mean online = mean not online #two sided test, CI: 95%, check variances
var(d$X9Profit[d$X9Online=="Y"])
## [1] 31885.04
var(d$X9Profit[d$X9Online=="N"])
## [1] 24302.32
t.test(d$X9Profit ~ d$X9Online, mu=0, alt="two.sided", conf=0.95, var.eq=F, paired=F)
##
## Welch Two Sample t-test
##
## data: d$X9Profit by d$X9Online
## t = -2.57, df = 3115.8, p-value = 0.01022
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -17.004022 -2.286535
## sample estimates:
## mean in group N mean in group Y
## 64.65234 74.29762
#the outliers have a significant effect on the data, other factors should be observed.
linear<- lm(d$X9Profit ~ d$X9Online, d)
summary(linear)
##
## Call:
## lm(formula = d$X9Profit ~ d$X9Online, data = d)
##
## Residuals:
## Min 1Q Median 3Q Max
## -294.30 -103.65 -54.65 80.35 540.70
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 64.652 1.221 52.955 <2e-16 ***
## d$X9OnlineY 9.645 3.395 2.841 0.0045 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 159 on 19480 degrees of freedom
## Multiple R-squared: 0.0004143, Adjusted R-squared: 0.000363
## F-statistic: 8.073 on 1 and 19480 DF, p-value: 0.004497
linearPDC2<- lm(PDC2$X9Profit ~ PDC2$X9Online, PDC2)
summary(linearPDC2)
##
## Call:
## lm(formula = PDC2$X9Profit ~ PDC2$X9Online, data = PDC2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -356.67 -162.88 -105.88 73.12 1942.12
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 128.877 2.104 61.248 <2e-16 ***
## PDC2$X9OnlineY 7.788 5.867 1.327 0.184
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 285.2 on 21081 degrees of freedom
## Multiple R-squared: 8.358e-05, Adjusted R-squared: 3.615e-05
## F-statistic: 1.762 on 1 and 21081 DF, p-value: 0.1844
#calculate information gain
library(FSelector)
dIG <-information.gain(X9Profit~., PDC2, unit ="log2")
print(dIG)
## attr_importance
## ID 0.000000000
## X9Online 0.001265253
## X9Age 0.021966304
## X9Inc 0.023204363
## X9Tenure 0.036181518
## X9District 0.002020581
## X9Billpay 0.001774466
#cheak category rankings with outliers included
linear3 <-lm(PDC2$X9Profit ~PDC2$X9Online +PDC2$X9Age +PDC2$X9Inc +PDC2$X9Tenure +PDC2$X9District +PDC2$X9Billpay, PDC2)
summary(linear3)
##
## Call:
## lm(formula = PDC2$X9Profit ~ PDC2$X9Online + PDC2$X9Age + PDC2$X9Inc +
## PDC2$X9Tenure + PDC2$X9District + PDC2$X9Billpay, data = PDC2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -517.43 -160.80 -71.81 69.57 1949.13
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -102.02969 49.24514 -2.072 0.0383 *
## PDC2$X9OnlineY 7.26446 6.16834 1.178 0.2389
## PDC2$X9Age 18.49726 1.31409 14.076 < 2e-16 ***
## PDC2$X9Inc 18.41982 0.82545 22.315 < 2e-16 ***
## PDC2$X9Tenure 4.01748 0.24545 16.368 < 2e-16 ***
## PDC2$X9District 0.00579 0.04045 0.143 0.8862
## PDC2$X9Billpay 92.93172 15.39293 6.037 1.59e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 276.6 on 21076 degrees of freedom
## Multiple R-squared: 0.05972, Adjusted R-squared: 0.05945
## F-statistic: 223.1 on 6 and 21076 DF, p-value: < 2.2e-16
#looking at this data, we can see that oge, income, and tenure, and billpay have the largest affect on the profit #redo graph with those 4 main factors
linear4 <-lm(PDC2$X9Profit ~PDC2$X9Online +PDC2$X9Age +PDC2$X9Inc +PDC2$X9Tenure +PDC2$X9Billpay, PDC2)
summary(linear4)
##
## Call:
## lm(formula = PDC2$X9Profit ~ PDC2$X9Online + PDC2$X9Age + PDC2$X9Inc +
## PDC2$X9Tenure + PDC2$X9Billpay, data = PDC2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -517.47 -160.82 -71.88 69.58 1949.11
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -95.0562 7.2368 -13.135 < 2e-16 ***
## PDC2$X9OnlineY 7.2614 6.1682 1.177 0.239
## PDC2$X9Age 18.4917 1.3135 14.078 < 2e-16 ***
## PDC2$X9Inc 18.4226 0.8252 22.325 < 2e-16 ***
## PDC2$X9Tenure 4.0177 0.2454 16.370 < 2e-16 ***
## PDC2$X9Billpay 92.9417 15.3924 6.038 1.59e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 276.6 on 21077 degrees of freedom
## Multiple R-squared: 0.05972, Adjusted R-squared: 0.0595
## F-statistic: 267.7 on 5 and 21077 DF, p-value: < 2.2e-16
#testing above equation #divide data into two sections (1 & 2) with 67% in on and 33% in the other #name data set # call the first set of data with 67% bank.training, #second set of data (33%)
#divide data into two sections (1 & 2) with 67% in on and 33% in the other
#name data set
set.seed(46)
ind <-sample(2,nrow(PDC2), replace=TRUE,prob=c(0.67,0.33))
# call the first set of data with 67% bank.training,
PDC2.training <-PDC2[ind==1,1:8 ]
head(PDC2.training)
## ID X9Profit X9Online X9Age X9Inc X9Tenure X9District X9Billpay
## 2 2 -6 N 6 3 29.50 1200 0
## 3 3 -49 Y 5 5 26.41 1100 0
## 5 5 -61 N 2 9 9.91 1200 0
## 7 7 -19 N 3 1 8.41 1300 0
## 8 8 59 N 5 8 7.33 1200 0
## 9 9 493 N 4 9 15.33 1200 0
#second set of data (33%)
PDC2.test <- PDC2[ind==2,1:8]
dim(PDC2.test)
## [1] 6864 8
dim(PDC2.training)
## [1] 14219 8
lineartrain<-lm(X9Profit ~ +X9Age +X9Inc +X9Tenure +X9Billpay, PDC2.training)
summary(lineartrain)
##
## Call:
## lm(formula = X9Profit ~ +X9Age + X9Inc + X9Tenure + X9Billpay,
## data = PDC2.training)
##
## Residuals:
## Min 1Q Median 3Q Max
## -523.84 -159.95 -72.34 69.10 1946.62
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -94.2339 8.6448 -10.901 < 2e-16 ***
## X9Age 18.8938 1.5769 11.982 < 2e-16 ***
## X9Inc 18.4796 0.9931 18.609 < 2e-16 ***
## X9Tenure 3.7452 0.2984 12.549 < 2e-16 ***
## X9Billpay 107.1946 17.2498 6.214 5.3e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 274.9 on 14214 degrees of freedom
## Multiple R-squared: 0.05835, Adjusted R-squared: 0.05808
## F-statistic: 220.2 on 4 and 14214 DF, p-value: < 2.2e-16
pred <-predict(lineartrain, newdata=PDC2.test)
rmse <-sqrt(sum(((pred)-PDC2.test$X9Profit)^2)/length(PDC2.test$X9Profit))
c(RMSE = rmse, R2 = summary(lineartrain)$r.squared)
## RMSE R2
## 280.1134932 0.0583489
par(mfrow=c(1,1))
plot(PDC2.test$X9Profit,pred)
cor(PDC2.test$X9Profit,pred)
## [1] 0.2490875