library(tidyr)
## Warning: package 'tidyr' was built under R version 3.2.3
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
library(RCurl)
## Loading required package: bitops
##
## Attaching package: 'RCurl'
## The following object is masked from 'package:tidyr':
##
## complete
The dataset consists of number of applications received for firearms transfers, the rejections and rate of felonies from 1999 - 2010.
# reading the data and skipping first 12 lines for data cleaning
firearms<- read.csv("https://raw.githubusercontent.com/gpsingh12/IS-607-MSDA/master/background_check.csv", skip=12, stringsAsFactors = TRUE)
head(firearms)
## X received Number Percent X.1 Number.1 Percent.1 X.2 applications
## 1 NA NA NA
## 2 1999 8,621,000 204,000 2.4 % 148,000 73 % 17.2
## 3 2000 7,699,000 153,000 2.0 88,000 58 11.4
## 4 2001 7,958,000 151,000 1.9 87,000 58 10.9
## 5 2002 7,806,000 136,000 1.7 65,000 48 8.3
## 6 2003 7,831,000 126,000 1.6 53,000 42 6.8
# rename the columns of the dataset
names(firearms) = c("Year", "Applications", "Rejected", "pct_rejected","X.1", "felonies", "pct_felonies", "X.2", "Rate")
# the formatting of the csv file has additional empty columns, we will remove those columns
firearms <- firearms[, -5]
firearms <- firearms[, -7]
#removing the first row of the data set
firearms <- firearms[-c(1),]
head(firearms)
## Year Applications Rejected pct_rejected felonies pct_felonies Rate
## 2 1999 8,621,000 204,000 2.4 148,000 73 17.2
## 3 2000 7,699,000 153,000 2.0 88,000 58 11.4
## 4 2001 7,958,000 151,000 1.9 87,000 58 10.9
## 5 2002 7,806,000 136,000 1.7 65,000 48 8.3
## 6 2003 7,831,000 126,000 1.6 53,000 42 6.8
## 7 2004 8,084,000 126,000 1.6 53,000 42 6.6
# the column Rejected has vales that need to be converted into numeric values by removing unneccessary signs (",")
rejection<-as.numeric(sub("\\,","", firearms$Rejected))
rate<- firearms$Rate
#Creating a data frame for Analysis
data = data.frame(firearms$Year, firearms$Applications, rejection, firearms$pct_rejected, firearms$felonies, firearms$pct_felonies, rate)
data
## firearms.Year firearms.Applications rejection firearms.pct_rejected
## 1 1999 8,621,000 204000 2.4
## 2 2000 7,699,000 153000 2.0
## 3 2001 7,958,000 151000 1.9
## 4 2002 7,806,000 136000 1.7
## 5 2003 7,831,000 126000 1.6
## 6 2004 8,084,000 126000 1.6
## 7 2005 8,278,000 132000 1.6
## 8 2006 8,612,000 135000 1.6
## 9 2007 8,658,000 136000 1.6
## 10 2008\\b\\ 9,900,000 147000 1.5
## 11 2009 10,764,000 150000 1.4
## 12 2010 10,405,000 153000 1.5
## firearms.felonies firearms.pct_felonies rate
## 1 148,000 73 17.2
## 2 88,000 58 11.4
## 3 87,000 58 10.9
## 4 65,000 48 8.3
## 5 53,000 42 6.8
## 6 53,000 42 6.6
## 7 57,000 43 6.9
## 8 52,000 39 6.0
## 9 49,000 36 5.7
## 10 77,000 52 7.8
## 11 67,000 45 6.2
## 12 62,000 40 6.0
Rate of felonies per 1000 applications depends on number of applications rejected each year. Is there any correlation between two variables.
(or Can a linear model be created to predict one varible using other)
we need to find relation between rate and rejection. i.e whetehr one is predictor of the other variable. The most common and effective way is checking for correlation between these two variables to check whether it is appropriate to create a linear regression model or not.
cor(data$rejection,data$rate)
## [1] 0.852463
strong correlation between these two variables. We will check this further by creating a scatter plot and checking if it is a straight line
plot(rate, rejection)
Althoughthe scatterplot does not predict any strong correlation, yet we will proceed with the regression model
model1<- lm(formula = rejection ~ rate , data = data)
summary(model1)
##
## Call:
## lm(formula = rejection ~ rate, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11711 -9235 -2356 5881 19530
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 101667 9169 11.089 6.12e-07 ***
## rate 5300 1028 5.156 0.000427 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11480 on 10 degrees of freedom
## Multiple R-squared: 0.7267, Adjusted R-squared: 0.6994
## F-statistic: 26.59 on 1 and 10 DF, p-value: 0.0004274
# model
# rejection = 101667 + 5301* rate
Dataset consists of oil consumption details of various brands in the first half of 2015 along with the opening balances.
# reading the data
oil<- read.csv("https://raw.githubusercontent.com/gpsingh12/IS-607-MSDA/master/oil_data.csv", na.strings=FALSE, skip = 2)
oil
## Month Category Caltex Gulf Mobil
## 1 Open Engine Oil 140 : 000 199 : 000 141 : 000
## 2 GearBox Oil 198 : 000 132 : 000 121 : 000
## 3 Jan Engine Oil 170 : 103 194 : 132 109 : 127
## 4 GearBox Oil 132 : 106 125 : 105 191 : 100
## 5 Feb Engine Oil 112 : 133 138 : 113 171 : 101
## 6 GearBox Oil 193 : 148 199 : 119 134 : 127
## 7 Mar Engine Oil 184 : 100 141 : 141 114 : 108
## 8 GearBox Oil 138 : 121 172 : 133 193 : 115
## 9 Apr Engine Oil 149 : 150 117 : 118 117 : 118
## 10 GearBox Oil 185 : 125 191 : 133 119 : 121
## 11 May Engine Oil 170 : 139 104 : 119 200 : 117
## 12 GearBox Oil 168 : 117 138 : 102 121 : 146
## 13 Jun Engine Oil 159 : 129 170 : 138 169 : 105
## 14 GearBox Oil 107 : 129 195 : 141 141 : 112
# name the missing headers
oil[2,1]<-c("Open")
oil[4,1]<-c("Jan")
oil[6,1]<-c("Feb")
oil[8,1]<-c("Mar")
oil[10,1]<-c("Apr")
oil[12,1]<-c("May")
oil[14,1]<-c("Jun")
#Extracting the data and converting to numeric
a<-(sub("\\:", "", oil$Caltex))
b<-(sub("\\:", "", oil$Gulf))
c<- (sub("\\:", "", oil$Mobil))
x<-as.numeric(unlist(str_extract_all(a,"[:digit:]{2,}")))
y<-as.numeric(unlist(str_extract_all(b,"[:digit:]{2,}")))
z<-as.numeric(unlist(str_extract_all(c,"[:digit:]{2,}")))
# from brand name we swlect alternate values for purchase and consumption after #extracting them
Caltex_pur <-x[c(1,3,5,7,9,11,13,15,17,19,21,23,25,27)]
Caltex_cons <-x[c(2,4,6,8,10,12,14,16,18,20,22,24,26,28)]
Gulf_pur<- y[c(1,3,5,7,9,11,13,15,17,19,21,23,25,27)]
Gulf_cons<-y[c(2,4,6,8,10,12,14,16,18,20,22,24,26,28)]
Mobil_pur<-z[c(1,3,5,7,9,11,13,15,17,19,21,23,25,27)]
Mobil_cons<-z[c(2,4,6,8,10,12,14,16,18,20,22,24,26,28)]
oil_data <-data.frame(oil$Month, oil$Category, Caltex_pur,Caltex_cons,Gulf_pur,Gulf_cons,Mobil_pur,Mobil_cons)
oil_data
## oil.Month oil.Category Caltex_pur Caltex_cons Gulf_pur Gulf_cons
## 1 Open Engine Oil 140 0 199 0
## 2 Open GearBox Oil 198 0 132 0
## 3 Jan Engine Oil 170 103 194 132
## 4 Jan GearBox Oil 132 106 125 105
## 5 Feb Engine Oil 112 133 138 113
## 6 Feb GearBox Oil 193 148 199 119
## 7 Mar Engine Oil 184 100 141 141
## 8 Mar GearBox Oil 138 121 172 133
## 9 Apr Engine Oil 149 150 117 118
## 10 Apr GearBox Oil 185 125 191 133
## 11 May Engine Oil 170 139 104 119
## 12 May GearBox Oil 168 117 138 102
## 13 Jun Engine Oil 159 129 170 138
## 14 Jun GearBox Oil 107 129 195 141
## Mobil_pur Mobil_cons
## 1 141 0
## 2 121 0
## 3 109 127
## 4 191 100
## 5 171 101
## 6 134 127
## 7 114 108
## 8 193 115
## 9 117 118
## 10 119 121
## 11 200 117
## 12 121 146
## 13 169 105
## 14 141 112
#closing balance of brand
oil_data1 =oil_data%>%
mutate(Caltex_Close= Caltex_pur - Caltex_cons)
oil_data2 = oil_data1 %>%
mutate(Gulf_Close = Gulf_pur - Gulf_cons)
oil_data3 = oil_data2 %>%
mutate(Mobil_Close = Mobil_pur-Mobil_cons)
oil_data3
## oil.Month oil.Category Caltex_pur Caltex_cons Gulf_pur Gulf_cons
## 1 Open Engine Oil 140 0 199 0
## 2 Open GearBox Oil 198 0 132 0
## 3 Jan Engine Oil 170 103 194 132
## 4 Jan GearBox Oil 132 106 125 105
## 5 Feb Engine Oil 112 133 138 113
## 6 Feb GearBox Oil 193 148 199 119
## 7 Mar Engine Oil 184 100 141 141
## 8 Mar GearBox Oil 138 121 172 133
## 9 Apr Engine Oil 149 150 117 118
## 10 Apr GearBox Oil 185 125 191 133
## 11 May Engine Oil 170 139 104 119
## 12 May GearBox Oil 168 117 138 102
## 13 Jun Engine Oil 159 129 170 138
## 14 Jun GearBox Oil 107 129 195 141
## Mobil_pur Mobil_cons Caltex_Close Gulf_Close Mobil_Close
## 1 141 0 140 199 141
## 2 121 0 198 132 121
## 3 109 127 67 62 -18
## 4 191 100 26 20 91
## 5 171 101 -21 25 70
## 6 134 127 45 80 7
## 7 114 108 84 0 6
## 8 193 115 17 39 78
## 9 117 118 -1 -1 -1
## 10 119 121 60 58 -2
## 11 200 117 31 -15 83
## 12 121 146 51 36 -25
## 13 169 105 30 32 64
## 14 141 112 -22 54 29
#Closing balance of three brands
closing_bal_Caltex<- sum(oil_data3$Caltex_Close)
closing_bal_Gulf<- sum(oil_data3$Gulf_Close)
closing_bal_Mobil<- sum(oil_data3$Mobil_Close)
closing_bal_Caltex
## [1] 705
closing_bal_Gulf
## [1] 721
closing_bal_Mobil
## [1] 644
#closing balance of components
Eng_Oil<- subset(oil_data3, oil_data3$oil.Category == "Engine Oil")
Gear_Oil<- subset(oil_data3, oil_data3$oil.Category == "GearBox Oil")
Eng_Oil
## oil.Month oil.Category Caltex_pur Caltex_cons Gulf_pur Gulf_cons
## 1 Open Engine Oil 140 0 199 0
## 3 Jan Engine Oil 170 103 194 132
## 5 Feb Engine Oil 112 133 138 113
## 7 Mar Engine Oil 184 100 141 141
## 9 Apr Engine Oil 149 150 117 118
## 11 May Engine Oil 170 139 104 119
## 13 Jun Engine Oil 159 129 170 138
## Mobil_pur Mobil_cons Caltex_Close Gulf_Close Mobil_Close
## 1 141 0 140 199 141
## 3 109 127 67 62 -18
## 5 171 101 -21 25 70
## 7 114 108 84 0 6
## 9 117 118 -1 -1 -1
## 11 200 117 31 -15 83
## 13 169 105 30 32 64
Gear_Oil
## oil.Month oil.Category Caltex_pur Caltex_cons Gulf_pur Gulf_cons
## 2 Open GearBox Oil 198 0 132 0
## 4 Jan GearBox Oil 132 106 125 105
## 6 Feb GearBox Oil 193 148 199 119
## 8 Mar GearBox Oil 138 121 172 133
## 10 Apr GearBox Oil 185 125 191 133
## 12 May GearBox Oil 168 117 138 102
## 14 Jun GearBox Oil 107 129 195 141
## Mobil_pur Mobil_cons Caltex_Close Gulf_Close Mobil_Close
## 2 121 0 198 132 121
## 4 191 100 26 20 91
## 6 134 127 45 80 7
## 8 193 115 17 39 78
## 10 119 121 60 58 -2
## 12 121 146 51 36 -25
## 14 141 112 -22 54 29
#closing balance of Engine oil
cbal1<- sum(Eng_Oil$Caltex_Close) + sum(Eng_Oil$Gulf_Close)+
sum(Eng_Oil$Mobil_Close)
cbal1
## [1] 977
#closing balance of Gearbox oil
cbal2<-sum(Gear_Oil$Caltex_Close) + sum(Gear_Oil$Gulf_Close)+
sum(Gear_Oil$Mobil_Close)
cbal2
## [1] 1093
# Most consumed brand across two categories of oil
oil_data3
## oil.Month oil.Category Caltex_pur Caltex_cons Gulf_pur Gulf_cons
## 1 Open Engine Oil 140 0 199 0
## 2 Open GearBox Oil 198 0 132 0
## 3 Jan Engine Oil 170 103 194 132
## 4 Jan GearBox Oil 132 106 125 105
## 5 Feb Engine Oil 112 133 138 113
## 6 Feb GearBox Oil 193 148 199 119
## 7 Mar Engine Oil 184 100 141 141
## 8 Mar GearBox Oil 138 121 172 133
## 9 Apr Engine Oil 149 150 117 118
## 10 Apr GearBox Oil 185 125 191 133
## 11 May Engine Oil 170 139 104 119
## 12 May GearBox Oil 168 117 138 102
## 13 Jun Engine Oil 159 129 170 138
## 14 Jun GearBox Oil 107 129 195 141
## Mobil_pur Mobil_cons Caltex_Close Gulf_Close Mobil_Close
## 1 141 0 140 199 141
## 2 121 0 198 132 121
## 3 109 127 67 62 -18
## 4 191 100 26 20 91
## 5 171 101 -21 25 70
## 6 134 127 45 80 7
## 7 114 108 84 0 6
## 8 193 115 17 39 78
## 9 117 118 -1 -1 -1
## 10 119 121 60 58 -2
## 11 200 117 31 -15 83
## 12 121 146 51 36 -25
## 13 169 105 30 32 64
## 14 141 112 -22 54 29
cal<- as.numeric(max(oil_data3$Caltex_cons))
gulf<-max(oil_data3$Gulf_cons)
mobil<-max(oil_data3$Mobil_cons)
cal
## [1] 150
gulf
## [1] 141
mobil
## [1] 146
Caltex is the most consumed brand across two categories of oil
Find the total percent difference from the estimates (high and low) and the sold prices. (Sold Price - High Estimate)/High Estimate = % – There should be two answers: High Difference and Low Difference
# reading the data
auction <- read.csv("https://raw.githubusercontent.com/gpsingh12/IS-607-MSDA/master/auction.csv")
#removing unwanted columns in csv file
auction <- auction[ , c(-2)]
auction
## Lot Range Price
## 1 2001 $300 - $400 $283
## 2 2002 $80 - $120 $234
## 3 2003 $200 - $300 $357
#extracting the data
x<-(sub("\\-", "", auction$Range))
x
## [1] "$300 $400" "$80 $120" "$200 $300"
y<-(sub("\\$", "", auction$Price))
df <- c(auction$Lot, x,y)
Price <- as.numeric(y)
Year<-as.numeric(auction$Lot)
b1<- sub("\\$", "", x)
b2<-sub(" \\$", " ", b1)
b3<-unlist(str_extract_all(b2," [:digit:]{3}"))
hrange<-as.numeric(b3)
b4<-unlist(str_extract_all(b2, "[:digit:]{2,}"))
b5<- b4[c(1,3,5)]
lrange<- as.numeric(b5)
# creating the data frame
auc <- data.frame(Year, lrange,hrange,Price)
auc
## Year lrange hrange Price
## 1 2001 300 400 283
## 2 2002 80 120 234
## 3 2003 200 300 357
Find the total percent difference from the estimates (high and low) and the sold prices. (Sold Price - High Estimate)/High Estimate = %
# adding another columns as required in Analysis
auct = auc%>%
mutate(`High Difference` = ((Price-hrange)/hrange)*100)
auc1 = auct%>%
mutate(`Low Difference`= ((Price-lrange)/lrange)*100)
auc1
## Year lrange hrange Price High Difference Low Difference
## 1 2001 300 400 283 -29.25 -5.666667
## 2 2002 80 120 234 95.00 192.500000
## 3 2003 200 300 357 19.00 78.500000