Project 2

library(tidyr)

## Warning: package 'tidyr' was built under R version 3.2.3

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(stringr)

library(RCurl)

## Loading required package: bitops

## 
## Attaching package: 'RCurl'

## The following object is masked from 'package:tidyr':
## 
##     complete

DataSet 1

The dataset consists of number of applications received for firearms transfers, the rejections and rate of felonies from 1999 - 2010.

# reading the data and skipping first 12 lines for data cleaning
firearms<- read.csv("https://raw.githubusercontent.com/gpsingh12/IS-607-MSDA/master/background_check.csv", skip=12, stringsAsFactors = TRUE)
head(firearms)

##      X  received  Number Percent X.1 Number.1 Percent.1 X.2 applications
## 1                             NA                     NA               NA
## 2 1999 8,621,000 204,000     2.4   %  148,000        73   %         17.2
## 3 2000 7,699,000 153,000     2.0       88,000        58             11.4
## 4 2001 7,958,000 151,000     1.9       87,000        58             10.9
## 5 2002 7,806,000 136,000     1.7       65,000        48              8.3
## 6 2003 7,831,000 126,000     1.6       53,000        42              6.8

# rename the columns of the dataset
names(firearms) = c("Year", "Applications", "Rejected", "pct_rejected","X.1", "felonies", "pct_felonies", "X.2", "Rate")

# the formatting of the csv file has additional empty columns, we will remove those columns
firearms <- firearms[, -5]
firearms <- firearms[, -7]

#removing the first row of the data set 
firearms <- firearms[-c(1),]
head(firearms)

##   Year Applications Rejected pct_rejected felonies pct_felonies Rate
## 2 1999    8,621,000  204,000          2.4  148,000           73 17.2
## 3 2000    7,699,000  153,000          2.0   88,000           58 11.4
## 4 2001    7,958,000  151,000          1.9   87,000           58 10.9
## 5 2002    7,806,000  136,000          1.7   65,000           48  8.3
## 6 2003    7,831,000  126,000          1.6   53,000           42  6.8
## 7 2004    8,084,000  126,000          1.6   53,000           42  6.6

# the column Rejected has vales that need to be converted into numeric values by removing unneccessary signs (",")
rejection<-as.numeric(sub("\\,","", firearms$Rejected)) 

rate<- firearms$Rate


#Creating a data frame for Analysis
data = data.frame(firearms$Year, firearms$Applications, rejection, firearms$pct_rejected, firearms$felonies, firearms$pct_felonies, rate)
data

##    firearms.Year firearms.Applications rejection firearms.pct_rejected
## 1           1999             8,621,000    204000                   2.4
## 2           2000             7,699,000    153000                   2.0
## 3           2001             7,958,000    151000                   1.9
## 4           2002             7,806,000    136000                   1.7
## 5           2003             7,831,000    126000                   1.6
## 6           2004             8,084,000    126000                   1.6
## 7           2005             8,278,000    132000                   1.6
## 8           2006             8,612,000    135000                   1.6
## 9           2007             8,658,000    136000                   1.6
## 10     2008\\b\\             9,900,000    147000                   1.5
## 11          2009            10,764,000    150000                   1.4
## 12          2010            10,405,000    153000                   1.5
##    firearms.felonies firearms.pct_felonies rate
## 1            148,000                    73 17.2
## 2             88,000                    58 11.4
## 3             87,000                    58 10.9
## 4             65,000                    48  8.3
## 5             53,000                    42  6.8
## 6             53,000                    42  6.6
## 7             57,000                    43  6.9
## 8             52,000                    39  6.0
## 9             49,000                    36  5.7
## 10            77,000                    52  7.8
## 11            67,000                    45  6.2
## 12            62,000                    40  6.0

Analysis

Rate of felonies per 1000 applications depends on number of applications rejected each year. Is there any correlation between two variables.

(or Can a linear model be created to predict one varible using other)

we need to find relation between rate and rejection. i.e whetehr one is predictor of the other variable. The most common and effective way is checking for correlation between these two variables to check whether it is appropriate to create a linear regression model or not.

cor(data$rejection,data$rate)

## [1] 0.852463

strong correlation between these two variables. We will check this further by creating a scatter plot and checking if it is a straight line

plot(rate, rejection)

Althoughthe scatterplot does not predict any strong correlation, yet we will proceed with the regression model

model1<- lm(formula = rejection ~ rate  , data = data)
summary(model1)

## 
## Call:
## lm(formula = rejection ~ rate, data = data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -11711  -9235  -2356   5881  19530 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   101667       9169  11.089 6.12e-07 ***
## rate            5300       1028   5.156 0.000427 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11480 on 10 degrees of freedom
## Multiple R-squared:  0.7267, Adjusted R-squared:  0.6994 
## F-statistic: 26.59 on 1 and 10 DF,  p-value: 0.0004274

# model 
# rejection = 101667 + 5301* rate

DataSet 2

Dataset consists of oil consumption details of various brands in the first half of 2015 along with the opening balances.

# reading the data

oil<- read.csv("https://raw.githubusercontent.com/gpsingh12/IS-607-MSDA/master/oil_data.csv", na.strings=FALSE, skip = 2)

oil

##    Month    Category     Caltex       Gulf      Mobil
## 1   Open  Engine Oil 140 :  000 199 :  000 141 :  000
## 2        GearBox Oil 198 :  000 132 :  000 121 :  000
## 3    Jan  Engine Oil 170 :  103 194 :  132 109 :  127
## 4        GearBox Oil 132 :  106 125 :  105 191 :  100
## 5    Feb  Engine Oil 112 :  133 138 :  113 171 :  101
## 6        GearBox Oil 193 :  148 199 :  119 134 :  127
## 7    Mar  Engine Oil 184 :  100 141 :  141 114 :  108
## 8        GearBox Oil 138 :  121 172 :  133 193 :  115
## 9    Apr  Engine Oil 149 :  150 117 :  118 117 :  118
## 10       GearBox Oil 185 :  125 191 :  133 119 :  121
## 11   May  Engine Oil 170 :  139 104 :  119 200 :  117
## 12       GearBox Oil 168 :  117 138 :  102 121 :  146
## 13   Jun  Engine Oil 159 :  129 170 :  138 169 :  105
## 14       GearBox Oil 107 :  129 195 :  141 141 :  112

# name the missing headers

oil[2,1]<-c("Open")
oil[4,1]<-c("Jan")
oil[6,1]<-c("Feb")
oil[8,1]<-c("Mar")
oil[10,1]<-c("Apr")
oil[12,1]<-c("May")
oil[14,1]<-c("Jun")


#Extracting the data and converting to numeric

a<-(sub("\\:", "", oil$Caltex))
b<-(sub("\\:", "", oil$Gulf))
c<- (sub("\\:", "", oil$Mobil))

x<-as.numeric(unlist(str_extract_all(a,"[:digit:]{2,}")))
y<-as.numeric(unlist(str_extract_all(b,"[:digit:]{2,}")))
z<-as.numeric(unlist(str_extract_all(c,"[:digit:]{2,}")))

# from brand name we swlect alternate values for purchase and consumption after #extracting them
Caltex_pur <-x[c(1,3,5,7,9,11,13,15,17,19,21,23,25,27)]
Caltex_cons <-x[c(2,4,6,8,10,12,14,16,18,20,22,24,26,28)]

Gulf_pur<- y[c(1,3,5,7,9,11,13,15,17,19,21,23,25,27)]
Gulf_cons<-y[c(2,4,6,8,10,12,14,16,18,20,22,24,26,28)]


Mobil_pur<-z[c(1,3,5,7,9,11,13,15,17,19,21,23,25,27)]
Mobil_cons<-z[c(2,4,6,8,10,12,14,16,18,20,22,24,26,28)]

oil_data <-data.frame(oil$Month, oil$Category, Caltex_pur,Caltex_cons,Gulf_pur,Gulf_cons,Mobil_pur,Mobil_cons)
oil_data

##    oil.Month oil.Category Caltex_pur Caltex_cons Gulf_pur Gulf_cons
## 1       Open   Engine Oil        140           0      199         0
## 2       Open  GearBox Oil        198           0      132         0
## 3        Jan   Engine Oil        170         103      194       132
## 4        Jan  GearBox Oil        132         106      125       105
## 5        Feb   Engine Oil        112         133      138       113
## 6        Feb  GearBox Oil        193         148      199       119
## 7        Mar   Engine Oil        184         100      141       141
## 8        Mar  GearBox Oil        138         121      172       133
## 9        Apr   Engine Oil        149         150      117       118
## 10       Apr  GearBox Oil        185         125      191       133
## 11       May   Engine Oil        170         139      104       119
## 12       May  GearBox Oil        168         117      138       102
## 13       Jun   Engine Oil        159         129      170       138
## 14       Jun  GearBox Oil        107         129      195       141
##    Mobil_pur Mobil_cons
## 1        141          0
## 2        121          0
## 3        109        127
## 4        191        100
## 5        171        101
## 6        134        127
## 7        114        108
## 8        193        115
## 9        117        118
## 10       119        121
## 11       200        117
## 12       121        146
## 13       169        105
## 14       141        112

Analysis

#closing balance of  brand

oil_data1 =oil_data%>%
  mutate(Caltex_Close= Caltex_pur - Caltex_cons)
oil_data2 = oil_data1 %>%
  mutate(Gulf_Close = Gulf_pur - Gulf_cons)

oil_data3 = oil_data2 %>%
  mutate(Mobil_Close = Mobil_pur-Mobil_cons)
oil_data3

##    oil.Month oil.Category Caltex_pur Caltex_cons Gulf_pur Gulf_cons
## 1       Open   Engine Oil        140           0      199         0
## 2       Open  GearBox Oil        198           0      132         0
## 3        Jan   Engine Oil        170         103      194       132
## 4        Jan  GearBox Oil        132         106      125       105
## 5        Feb   Engine Oil        112         133      138       113
## 6        Feb  GearBox Oil        193         148      199       119
## 7        Mar   Engine Oil        184         100      141       141
## 8        Mar  GearBox Oil        138         121      172       133
## 9        Apr   Engine Oil        149         150      117       118
## 10       Apr  GearBox Oil        185         125      191       133
## 11       May   Engine Oil        170         139      104       119
## 12       May  GearBox Oil        168         117      138       102
## 13       Jun   Engine Oil        159         129      170       138
## 14       Jun  GearBox Oil        107         129      195       141
##    Mobil_pur Mobil_cons Caltex_Close Gulf_Close Mobil_Close
## 1        141          0          140        199         141
## 2        121          0          198        132         121
## 3        109        127           67         62         -18
## 4        191        100           26         20          91
## 5        171        101          -21         25          70
## 6        134        127           45         80           7
## 7        114        108           84          0           6
## 8        193        115           17         39          78
## 9        117        118           -1         -1          -1
## 10       119        121           60         58          -2
## 11       200        117           31        -15          83
## 12       121        146           51         36         -25
## 13       169        105           30         32          64
## 14       141        112          -22         54          29

#Closing balance of three brands
closing_bal_Caltex<- sum(oil_data3$Caltex_Close)
closing_bal_Gulf<- sum(oil_data3$Gulf_Close)
closing_bal_Mobil<- sum(oil_data3$Mobil_Close)

closing_bal_Caltex

## [1] 705

closing_bal_Gulf

## [1] 721

closing_bal_Mobil

## [1] 644

#closing balance of components

Eng_Oil<- subset(oil_data3, oil_data3$oil.Category == "Engine Oil")
Gear_Oil<- subset(oil_data3, oil_data3$oil.Category == "GearBox Oil")
Eng_Oil

##    oil.Month oil.Category Caltex_pur Caltex_cons Gulf_pur Gulf_cons
## 1       Open   Engine Oil        140           0      199         0
## 3        Jan   Engine Oil        170         103      194       132
## 5        Feb   Engine Oil        112         133      138       113
## 7        Mar   Engine Oil        184         100      141       141
## 9        Apr   Engine Oil        149         150      117       118
## 11       May   Engine Oil        170         139      104       119
## 13       Jun   Engine Oil        159         129      170       138
##    Mobil_pur Mobil_cons Caltex_Close Gulf_Close Mobil_Close
## 1        141          0          140        199         141
## 3        109        127           67         62         -18
## 5        171        101          -21         25          70
## 7        114        108           84          0           6
## 9        117        118           -1         -1          -1
## 11       200        117           31        -15          83
## 13       169        105           30         32          64

Gear_Oil

##    oil.Month oil.Category Caltex_pur Caltex_cons Gulf_pur Gulf_cons
## 2       Open  GearBox Oil        198           0      132         0
## 4        Jan  GearBox Oil        132         106      125       105
## 6        Feb  GearBox Oil        193         148      199       119
## 8        Mar  GearBox Oil        138         121      172       133
## 10       Apr  GearBox Oil        185         125      191       133
## 12       May  GearBox Oil        168         117      138       102
## 14       Jun  GearBox Oil        107         129      195       141
##    Mobil_pur Mobil_cons Caltex_Close Gulf_Close Mobil_Close
## 2        121          0          198        132         121
## 4        191        100           26         20          91
## 6        134        127           45         80           7
## 8        193        115           17         39          78
## 10       119        121           60         58          -2
## 12       121        146           51         36         -25
## 14       141        112          -22         54          29

#closing balance of Engine oil

cbal1<- sum(Eng_Oil$Caltex_Close) + sum(Eng_Oil$Gulf_Close)+
                sum(Eng_Oil$Mobil_Close)
cbal1

## [1] 977

#closing balance of Gearbox oil

cbal2<-sum(Gear_Oil$Caltex_Close) + sum(Gear_Oil$Gulf_Close)+
                sum(Gear_Oil$Mobil_Close)
cbal2

## [1] 1093

# Most consumed brand across two categories of oil

oil_data3

##    oil.Month oil.Category Caltex_pur Caltex_cons Gulf_pur Gulf_cons
## 1       Open   Engine Oil        140           0      199         0
## 2       Open  GearBox Oil        198           0      132         0
## 3        Jan   Engine Oil        170         103      194       132
## 4        Jan  GearBox Oil        132         106      125       105
## 5        Feb   Engine Oil        112         133      138       113
## 6        Feb  GearBox Oil        193         148      199       119
## 7        Mar   Engine Oil        184         100      141       141
## 8        Mar  GearBox Oil        138         121      172       133
## 9        Apr   Engine Oil        149         150      117       118
## 10       Apr  GearBox Oil        185         125      191       133
## 11       May   Engine Oil        170         139      104       119
## 12       May  GearBox Oil        168         117      138       102
## 13       Jun   Engine Oil        159         129      170       138
## 14       Jun  GearBox Oil        107         129      195       141
##    Mobil_pur Mobil_cons Caltex_Close Gulf_Close Mobil_Close
## 1        141          0          140        199         141
## 2        121          0          198        132         121
## 3        109        127           67         62         -18
## 4        191        100           26         20          91
## 5        171        101          -21         25          70
## 6        134        127           45         80           7
## 7        114        108           84          0           6
## 8        193        115           17         39          78
## 9        117        118           -1         -1          -1
## 10       119        121           60         58          -2
## 11       200        117           31        -15          83
## 12       121        146           51         36         -25
## 13       169        105           30         32          64
## 14       141        112          -22         54          29

cal<- as.numeric(max(oil_data3$Caltex_cons))
gulf<-max(oil_data3$Gulf_cons)
mobil<-max(oil_data3$Mobil_cons)
cal

## [1] 150

gulf

## [1] 141

mobil

## [1] 146

Caltex is the most consumed brand across two categories of oil

Dataset 3

Find the total percent difference from the estimates (high and low) and the sold prices. (Sold Price - High Estimate)/High Estimate = % – There should be two answers: High Difference and Low Difference

# reading the data
auction <- read.csv("https://raw.githubusercontent.com/gpsingh12/IS-607-MSDA/master/auction.csv")

#removing unwanted columns in csv file
auction <- auction[ , c(-2)]
auction

##    Lot       Range Price
## 1 2001 $300 - $400 $283 
## 2 2002  $80 - $120 $234 
## 3 2003 $200 - $300 $357

#extracting the data 
x<-(sub("\\-", "", auction$Range)) 
x

## [1] "$300  $400" "$80  $120"  "$200  $300"

y<-(sub("\\$", "", auction$Price))
df <- c(auction$Lot, x,y)
Price <- as.numeric(y)
Year<-as.numeric(auction$Lot)


b1<- sub("\\$", "", x)
b2<-sub("  \\$", " ", b1)

b3<-unlist(str_extract_all(b2," [:digit:]{3}"))
hrange<-as.numeric(b3)
b4<-unlist(str_extract_all(b2, "[:digit:]{2,}"))

b5<- b4[c(1,3,5)]
lrange<- as.numeric(b5)
# creating the data frame

auc <- data.frame(Year, lrange,hrange,Price)

auc

##   Year lrange hrange Price
## 1 2001    300    400   283
## 2 2002     80    120   234
## 3 2003    200    300   357

Analysis

Find the total percent difference from the estimates (high and low) and the sold prices. (Sold Price - High Estimate)/High Estimate = %

# adding another columns as required in Analysis

auct = auc%>%
  mutate(`High Difference` = ((Price-hrange)/hrange)*100)
auc1 = auct%>%
  mutate(`Low Difference`= ((Price-lrange)/lrange)*100)
auc1

##   Year lrange hrange Price High Difference Low Difference
## 1 2001    300    400   283          -29.25      -5.666667
## 2 2002     80    120   234           95.00     192.500000
## 3 2003    200    300   357           19.00      78.500000

Project 2

GP SINGH

March 8, 2016

DataSet 1

Analysis

DataSet 2

Analysis

Dataset 3

Analysis