#set directory #create data table , then rename so you maintain the original #view data #install libraries

setwd("/Users/carolyn.khalil/Desktop/R-Tutorial")
PG2<- read.csv("R-tutorial/Data/PilgrimData.csv")

head(PG2)
##   ID X9Profit X9Online X9Age X9Inc X9Tenure X9District X0Profit X0Online
## 1  1       21        0    NA    NA     6.33       1200       NA       NA
## 2  2       -6        0     6     3    29.50       1200      -32        0
## 3  3      -49        1     5     5    26.41       1100      -22        1
## 4  4       -4        0    NA    NA     2.25       1200       NA       NA
## 5  5      -61        0     2     9     9.91       1200       -4        0
## 6  6      -38        0    NA     3     2.33       1300       14        0
##   X9Billpay X0Billpay
## 1         0        NA
## 2         0         0
## 3         0         0
## 4         0        NA
## 5         0         0
## 6         0         0
summary(PG2)
##        ID           X9Profit         X9Online          X9Age      
##  Min.   :    1   Min.   :-221.0   Min.   :0.0000   Min.   :1.000  
##  1st Qu.: 7909   1st Qu.: -34.0   1st Qu.:0.0000   1st Qu.:3.000  
##  Median :15818   Median :   9.0   Median :0.0000   Median :4.000  
##  Mean   :15818   Mean   : 111.5   Mean   :0.1218   Mean   :4.046  
##  3rd Qu.:23726   3rd Qu.: 164.0   3rd Qu.:0.0000   3rd Qu.:5.000  
##  Max.   :31634   Max.   :2071.0   Max.   :1.0000   Max.   :7.000  
##                                                    NA's   :8289   
##      X9Inc          X9Tenure       X9District      X0Profit      
##  Min.   :1.000   Min.   : 0.16   Min.   :1100   Min.   :-5643.0  
##  1st Qu.:4.000   1st Qu.: 3.75   1st Qu.:1200   1st Qu.:  -30.0  
##  Median :6.000   Median : 7.41   Median :1200   Median :   23.0  
##  Mean   :5.459   Mean   :10.16   Mean   :1203   Mean   :  144.8  
##  3rd Qu.:7.000   3rd Qu.:14.75   3rd Qu.:1200   3rd Qu.:  206.0  
##  Max.   :9.000   Max.   :41.16   Max.   :1300   Max.   :27086.0  
##  NA's   :8261                                   NA's   :5238     
##     X0Online       X9Billpay         X0Billpay   
##  Min.   :0.000   Min.   :0.00000   Min.   :0.00  
##  1st Qu.:0.000   1st Qu.:0.00000   1st Qu.:0.00  
##  Median :0.000   Median :0.00000   Median :0.00  
##  Mean   :0.199   Mean   :0.01669   Mean   :0.03  
##  3rd Qu.:0.000   3rd Qu.:0.00000   3rd Qu.:0.00  
##  Max.   :1.000   Max.   :1.00000   Max.   :1.00  
##  NA's   :5219                      NA's   :5219
library(stats)
library(plyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(base)
library(ggplot2)

#what we can see here is ….. (data types)

#normalize function ``` { r} normalize <- function(x) { num <- x - min(x) denom <- max(x) - min(x) return (num/denom) }

#Normalization PDC2 <- normalize(PG2[1:11]) PDC2 <-as.data.frame(lapply(PG2[1:11], normalize))



#list column rows and check for missing data, then delete the rows with missing data and confirm missing data rows = 0

```r
#list column names
colnames(PG2)
##  [1] "ID"         "X9Profit"   "X9Online"   "X9Age"      "X9Inc"     
##  [6] "X9Tenure"   "X9District" "X0Profit"   "X0Online"   "X9Billpay" 
## [11] "X0Billpay"
sum(!complete.cases(PG2))
## [1] 10551
PDC2 <- na.omit(PG2)
sum(!complete.cases(PDC2))
## [1] 0

change online usage 1 to Y and 0 to N

PDC2[PDC2$X9Online == 1, "X9Online"] <- "Y"
PDC2[PDC2$X9Online == 0, "X9Online"] <- "N"

#check variable types and make sure profit is numeric and online is character. If not change (MAYBE need to change age, inc, billpay for regression)

sapply(PDC2, class)
##          ID    X9Profit    X9Online       X9Age       X9Inc    X9Tenure 
##   "integer"   "integer" "character"   "integer"   "integer"   "numeric" 
##  X9District    X0Profit    X0Online   X9Billpay   X0Billpay 
##   "integer"   "integer"   "integer"   "integer"   "integer"
PDC2$X9Profit <-as.numeric(PDC2$X9Profit)
PDC2$X9Online <-as.character(PDC2$X9Online)

#delete columns from 2000 data, incomplete yr.

PDC2 <-select(PDC2, -c(8, 9, 11))
View(PDC2)

#boxplot

BP <- boxplot(PDC2$X9Profit ~ PDC2$X9Online)

t-test

#Ho: mean online = mean not online #two sided test

#check variances (not working) #install.packages(“car”) #library(car) #levenes test #leveneTest(y=PDC2\(X9Profit, group = PDC2\)X9Online) #check variances, if not equal set var.eq tp F in t test

var(PDC2$X9Profit[PDC2$X9Online=="Y"])
## [1] 86823.35
var(PDC2$X9Profit[PDC2$X9Online=="N"])
## [1] 80531.39
t.test(PDC2$X9Profit ~ PDC2$X9Online, mu=0, alt="two.sided", conf=0.95, var.eq=F, paired=F)
## 
##  Welch Two Sample t-test
## 
## data:  PDC2$X9Profit by PDC2$X9Online
## t = -1.2909, df = 3494.6, p-value = 0.1968
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -19.61678   4.04068
## sample estimates:
## mean in group N mean in group Y 
##        128.8771        136.6652

p value > a, therefore we fail to reject the null hypothesis. We cannot conclude there is a significant different between the profits of online and non-online user

#try without outliers, check for Online = Y first.

#remove outliers from X9online = y 
library(stats)
quantile(PDC2$X9Profit[PDC2$X9Online=="Y"],0.25 )
## 25% 
## -40
quantile(PDC2$X9Profit[PDC2$X9Online=="Y"],0.75 )
## 75% 
## 222
quantile(PDC2$X9Profit[PDC2$X9Online=="Y"],0.50 )
## 50% 
##  31
IQR(PDC2$X9Profit[PDC2$X9Online=="Y"])
## [1] 262
fivenum(PDC2$X9Profit[PDC2$X9Online=="Y"])
## [1] -220  -40   31  222 1979
#outliers are 1.5 IQR away for Q1and QR
#find range to add and subtract from Q1and Q3
OutlierRange <-IQR(PDC2$X9Profit[PDC2$X9Online=="Y"]) * 1.5
print(OutlierRange)
## [1] 393
#find lower boundary 
OutlierLower <- quantile(PDC2$X9Profit[PDC2$X9Online=="Y"],0.25 ) - OutlierRange
print(OutlierLower)
##  25% 
## -433
#find upper boundary 
OutlierUpper <- quantile(PDC2$X9Profit[PDC2$X9Online=="Y"],0.75 ) + OutlierRange
print(OutlierUpper)
## 75% 
## 615
#remove lines with Y and profit less than -433
OutlierLower<-as.numeric(OutlierLower)
d<-PDC2[!(PDC2$X9Online=="Y" & PDC2$X9Profit < OutlierLower),]
d<-d[!(d$X9Online=="Y" & d$X9Profit > OutlierUpper),]

remove outlier from Online = N

#do the same for X9Online = N
quantile(PDC2$X9Profit[PDC2$X9Online=="N"],0.25 )
## 25% 
## -33
quantile(PDC2$X9Profit[PDC2$X9Online=="N"],0.75 )
## 75% 
## 200
quantile(PDC2$X9Profit[PDC2$X9Online=="N"],0.50 )
## 50% 
##  23
IQR(PDC2$X9Profit[PDC2$X9Online=="N"])
## [1] 233
fivenum(PDC2$X9Profit[PDC2$X9Online=="N"])
## [1] -221  -33   23  200 2071
#outliers are 1.5 IQR away for Q1and QR
#find range to add and subtract from Q1and Q3
OutlierRange2 <-IQR(PDC2$X9Profit[PDC2$X9Online=="N"]) * 1.5
print(OutlierRange2)
## [1] 349.5
#find lower boundary  for N
OutlierLower2 <- quantile(PDC2$X9Profit[PDC2$X9Online=="N"],0.25 ) - OutlierRange2
print(OutlierLower2)
##    25% 
## -382.5
#find upper boundary  for N
OutlierUpper2 <- quantile(PDC2$X9Profit[PDC2$X9Online=="N"],0.75 ) + OutlierRange2
print(OutlierUpper2)
##   75% 
## 549.5
#remove lines with N and profit less than outlierlower2 or greater than outlierupper2
OutlierLower2<-as.numeric(OutlierLower2)
d<-d[!(d$X9Online=="N" & d$X9Profit < OutlierLower2),]
d<-d[!(d$X9Online=="N" & d$X9Profit > OutlierUpper2),]

#new boxplot without outliers

BP2 <- boxplot(d$X9Profit ~ d$X9Online)

new t-test

#Ho: mean online = mean not online #two sided test, CI: 95%, check variances

var(d$X9Profit[d$X9Online=="Y"])
## [1] 31885.04
var(d$X9Profit[d$X9Online=="N"])
## [1] 24302.32
t.test(d$X9Profit ~ d$X9Online, mu=0, alt="two.sided", conf=0.95, var.eq=F, paired=F)
## 
##  Welch Two Sample t-test
## 
## data:  d$X9Profit by d$X9Online
## t = -2.57, df = 3115.8, p-value = 0.01022
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -17.004022  -2.286535
## sample estimates:
## mean in group N mean in group Y 
##        64.65234        74.29762

p value < a without the outliers therefore we reject the null hypothesis, there is a difference in the means between online users and none online users.

as the outliers compose 1601/21083 customers (7.59%) of the customers, they should be included in the results to avoid a sample bias.

#the outliers have a significant effect on the data, other factors should be observed.

linear<- lm(d$X9Profit ~ d$X9Online, d)
summary(linear)
## 
## Call:
## lm(formula = d$X9Profit ~ d$X9Online, data = d)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -294.30 -103.65  -54.65   80.35  540.70 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   64.652      1.221  52.955   <2e-16 ***
## d$X9OnlineY    9.645      3.395   2.841   0.0045 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 159 on 19480 degrees of freedom
## Multiple R-squared:  0.0004143,  Adjusted R-squared:  0.000363 
## F-statistic: 8.073 on 1 and 19480 DF,  p-value: 0.004497
linearPDC2<- lm(PDC2$X9Profit ~ PDC2$X9Online, PDC2)
summary(linearPDC2)
## 
## Call:
## lm(formula = PDC2$X9Profit ~ PDC2$X9Online, data = PDC2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -356.67 -162.88 -105.88   73.12 1942.12 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     128.877      2.104  61.248   <2e-16 ***
## PDC2$X9OnlineY    7.788      5.867   1.327    0.184    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 285.2 on 21081 degrees of freedom
## Multiple R-squared:  8.358e-05,  Adjusted R-squared:  3.615e-05 
## F-statistic: 1.762 on 1 and 21081 DF,  p-value: 0.1844

#calculate information gain

library(FSelector)
dIG <-information.gain(X9Profit~., PDC2, unit ="log2")
print(dIG)
##            attr_importance
## ID             0.000000000
## X9Online       0.001265253
## X9Age          0.021966304
## X9Inc          0.023204363
## X9Tenure       0.036181518
## X9District     0.002020581
## X9Billpay      0.001774466

#cheak category rankings with outliers included

linear3 <-lm(PDC2$X9Profit ~PDC2$X9Online +PDC2$X9Age +PDC2$X9Inc +PDC2$X9Tenure +PDC2$X9District +PDC2$X9Billpay, PDC2)
summary(linear3)
## 
## Call:
## lm(formula = PDC2$X9Profit ~ PDC2$X9Online + PDC2$X9Age + PDC2$X9Inc + 
##     PDC2$X9Tenure + PDC2$X9District + PDC2$X9Billpay, data = PDC2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -517.43 -160.80  -71.81   69.57 1949.13 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -102.02969   49.24514  -2.072   0.0383 *  
## PDC2$X9OnlineY     7.26446    6.16834   1.178   0.2389    
## PDC2$X9Age        18.49726    1.31409  14.076  < 2e-16 ***
## PDC2$X9Inc        18.41982    0.82545  22.315  < 2e-16 ***
## PDC2$X9Tenure      4.01748    0.24545  16.368  < 2e-16 ***
## PDC2$X9District    0.00579    0.04045   0.143   0.8862    
## PDC2$X9Billpay    92.93172   15.39293   6.037 1.59e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 276.6 on 21076 degrees of freedom
## Multiple R-squared:  0.05972,    Adjusted R-squared:  0.05945 
## F-statistic: 223.1 on 6 and 21076 DF,  p-value: < 2.2e-16

#looking at this data, we can see that oge, income, and tenure, and billpay have the largest affect on the profit #redo graph with those 4 main factors

linear4 <-lm(PDC2$X9Profit ~PDC2$X9Online +PDC2$X9Age +PDC2$X9Inc +PDC2$X9Tenure +PDC2$X9Billpay, PDC2)
summary(linear4)
## 
## Call:
## lm(formula = PDC2$X9Profit ~ PDC2$X9Online + PDC2$X9Age + PDC2$X9Inc + 
##     PDC2$X9Tenure + PDC2$X9Billpay, data = PDC2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -517.47 -160.82  -71.88   69.58 1949.11 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    -95.0562     7.2368 -13.135  < 2e-16 ***
## PDC2$X9OnlineY   7.2614     6.1682   1.177    0.239    
## PDC2$X9Age      18.4917     1.3135  14.078  < 2e-16 ***
## PDC2$X9Inc      18.4226     0.8252  22.325  < 2e-16 ***
## PDC2$X9Tenure    4.0177     0.2454  16.370  < 2e-16 ***
## PDC2$X9Billpay  92.9417    15.3924   6.038 1.59e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 276.6 on 21077 degrees of freedom
## Multiple R-squared:  0.05972,    Adjusted R-squared:  0.0595 
## F-statistic: 267.7 on 5 and 21077 DF,  p-value: < 2.2e-16

equation = Bo + B1(Age) + B2(inc)+ B3(Tenure) +B4(billpay) where B0,B1,B2,B3,and B4 are the coefficients below in order

therefore y = 18.2857A + 18.4842I + 4.0118T +99.1950B - 93.6641

#testing above equation #divide data into two sections (1 & 2) with 67% in on and 33% in the other #name data set # call the first set of data with 67% bank.training, #second set of data (33%)

#divide data into two sections (1 & 2) with 67% in on and 33% in the other
#name data set
set.seed(46)
ind <-sample(2,nrow(PDC2), replace=TRUE,prob=c(0.67,0.33))

# call the first set of data with 67% bank.training, 
PDC2.training <-PDC2[ind==1,1:8 ]
head(PDC2.training)
##   ID X9Profit X9Online X9Age X9Inc X9Tenure X9District X9Billpay
## 2  2       -6        N     6     3    29.50       1200         0
## 3  3      -49        Y     5     5    26.41       1100         0
## 5  5      -61        N     2     9     9.91       1200         0
## 7  7      -19        N     3     1     8.41       1300         0
## 8  8       59        N     5     8     7.33       1200         0
## 9  9      493        N     4     9    15.33       1200         0
#second set of data (33%)
PDC2.test <- PDC2[ind==2,1:8] 

dim(PDC2.test)
## [1] 6864    8
dim(PDC2.training)
## [1] 14219     8
lineartrain<-lm(X9Profit ~ +X9Age +X9Inc +X9Tenure +X9Billpay, PDC2.training)
summary(lineartrain)
## 
## Call:
## lm(formula = X9Profit ~ +X9Age + X9Inc + X9Tenure + X9Billpay, 
##     data = PDC2.training)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -523.84 -159.95  -72.34   69.10 1946.62 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -94.2339     8.6448 -10.901  < 2e-16 ***
## X9Age        18.8938     1.5769  11.982  < 2e-16 ***
## X9Inc        18.4796     0.9931  18.609  < 2e-16 ***
## X9Tenure      3.7452     0.2984  12.549  < 2e-16 ***
## X9Billpay   107.1946    17.2498   6.214  5.3e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 274.9 on 14214 degrees of freedom
## Multiple R-squared:  0.05835,    Adjusted R-squared:  0.05808 
## F-statistic: 220.2 on 4 and 14214 DF,  p-value: < 2.2e-16
pred <-predict(lineartrain, newdata=PDC2.test)
rmse <-sqrt(sum(((pred)-PDC2.test$X9Profit)^2)/length(PDC2.test$X9Profit))
c(RMSE = rmse, R2 = summary(lineartrain)$r.squared)                
##        RMSE          R2 
## 280.1134932   0.0583489
par(mfrow=c(1,1))
plot(PDC2.test$X9Profit,pred)                

cor(PDC2.test$X9Profit,pred) 
## [1] 0.2490875