First, we read the two datasets.
f1<-read.csv("financialdata.csv")
c1<-read.csv("candydata.csv")
Then we used the str() function to inspect the structure of the two data frames, f1 and c1.
str(f1)
## 'data.frame': 7562 obs. of 16 variables:
## $ Date : Factor w/ 7562 levels "1/10/1989","1/10/1990",..: 2375 488 510 532 554 575 22 44 66 88 ...
## $ IBM_Open : num 28.9 29.2 30.6 30.5 30.4 ...
## $ IBM_Close : num 28.9 30.2 30.5 30.7 30.8 ...
## $ IBM_Volume : int 4660000 7340400 8688800 6308400 5721200 8448000 7140000 9424000 8698400 5075200 ...
## $ XRX_Open : num 25.3 25.1 26.4 26.7 26.7 ...
## $ XRX_Close : num 24.9 26 26.7 26.8 27.2 ...
## $ XRX_Volume : int 862500 744500 986600 1148500 562600 839900 917600 1289000 635900 330800 ...
## $ DJI_Open : num 1927 1953 2056 2036 2020 ...
## $ DJI_Close : num 1939 2015 2032 2038 2052 ...
## $ DJI_Volume : int 15360000 20880000 27200000 18800000 21370000 27440000 23390000 23610000 24710000 16650000 ...
## $ SP_Open : num 248 247 256 259 259 ...
## $ GSPC_Close : num 247 256 259 259 261 ...
## $ GSPC_Volume: num 1.70e+08 1.82e+08 2.10e+08 1.70e+08 1.75e+08 ...
## $ NAS_Open : num 328 335 344 345 346 ...
## $ NAS_Close : num 330 338 344 347 350 ...
## $ NAS_Volume : num 1.60e+08 1.11e+08 1.49e+08 1.36e+08 1.37e+08 ...
str(c1)
## 'data.frame': 85 obs. of 13 variables:
## $ competitorname : Factor w/ 85 levels "100 Grand","3 Musketeers",..: 1 2 45 46 3 4 5 6 7 8 ...
## $ chocolate : int 1 1 0 0 0 1 1 0 0 0 ...
## $ fruity : int 0 0 0 0 1 0 0 0 0 1 ...
## $ caramel : int 1 0 0 0 0 0 1 0 0 1 ...
## $ peanutyalmondy : int 0 0 0 0 0 1 1 1 0 0 ...
## $ nougat : int 0 1 0 0 0 0 1 0 0 0 ...
## $ crispedricewafer: int 1 0 0 0 0 0 0 0 0 0 ...
## $ hard : int 0 0 0 0 0 0 0 0 0 0 ...
## $ bar : int 1 1 0 0 0 1 1 0 0 0 ...
## $ pluribus : int 0 0 0 0 0 0 0 1 1 0 ...
## $ sugarpercent : num 0.732 0.604 0.011 0.011 0.906 ...
## $ pricepercent : num 0.86 0.511 0.116 0.511 0.511 ...
## $ winpercent : num 67 67.6 32.3 46.1 52.3 ...
Before isolating the rows for 1993, we converted the Date column from factor to Date so that dplyr could filter on the dates in 1993.
f1$Date<-as.Date(as.character(f1$Date),"%m/%d/%Y") #Converting the Date column from factor to Date
str(f1)
## 'data.frame': 7562 obs. of 16 variables:
## $ Date : Date, format: "1987-12-31" "1988-01-04" ...
## $ IBM_Open : num 28.9 29.2 30.6 30.5 30.4 ...
## $ IBM_Close : num 28.9 30.2 30.5 30.7 30.8 ...
## $ IBM_Volume : int 4660000 7340400 8688800 6308400 5721200 8448000 7140000 9424000 8698400 5075200 ...
## $ XRX_Open : num 25.3 25.1 26.4 26.7 26.7 ...
## $ XRX_Close : num 24.9 26 26.7 26.8 27.2 ...
## $ XRX_Volume : int 862500 744500 986600 1148500 562600 839900 917600 1289000 635900 330800 ...
## $ DJI_Open : num 1927 1953 2056 2036 2020 ...
## $ DJI_Close : num 1939 2015 2032 2038 2052 ...
## $ DJI_Volume : int 15360000 20880000 27200000 18800000 21370000 27440000 23390000 23610000 24710000 16650000 ...
## $ SP_Open : num 248 247 256 259 259 ...
## $ GSPC_Close : num 247 256 259 259 261 ...
## $ GSPC_Volume: num 1.70e+08 1.82e+08 2.10e+08 1.70e+08 1.75e+08 ...
## $ NAS_Open : num 328 335 344 345 346 ...
## $ NAS_Close : num 330 338 344 347 350 ...
## $ NAS_Volume : num 1.60e+08 1.11e+08 1.49e+08 1.36e+08 1.37e+08 ...
Next, we isolated the rows for 1993 from the f1 dataset and named the new dataset a1.
library(dplyr)
a1<-f1 %>% filter(Date >= "1993-01-01" & Date <= "1993-12-31")
str(a1)
## 'data.frame': 253 obs. of 16 variables:
## $ Date : Date, format: "1993-01-04" "1993-01-05" ...
## $ IBM_Open : num 12.9 12.5 12.2 12 11.7 ...
## $ IBM_Close : num 12.5 12.2 12 11.8 11.6 ...
## $ IBM_Volume : int 14225600 13935200 19358400 14734800 15790800 15498800 19464400 12833200 17147200 13492800 ...
## $ XRX_Open : num 34.8 36.5 37.3 37.4 36.9 ...
## $ XRX_Close : num 36.2 36.9 37.3 37.1 37 ...
## $ XRX_Volume : int 1378400 1206500 1558800 870700 727900 715200 662300 598100 648000 2082500 ...
## $ DJI_Open : num 3301 3309 3308 3305 3269 ...
## $ DJI_Close : num 3309 3308 3305 3269 3252 ...
## $ DJI_Volume : int 21400000 28060000 35790000 36820000 30690000 24500000 29660000 28950000 36150000 39270000 ...
## $ SP_Open : num 436 435 434 435 431 ...
## $ GSPC_Close : num 435 434 435 431 429 ...
## $ GSPC_Volume: num 2.01e+08 2.40e+08 2.95e+08 3.05e+08 2.63e+08 ...
## $ NAS_Open : num 675 671 678 682 677 ...
## $ NAS_Close : num 672 674 682 678 677 ...
## $ NAS_Volume : num 1.89e+08 2.29e+08 2.70e+08 2.97e+08 2.33e+08 ...
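An equivalent way to write the same filter, sketched here with the hypothetical name a1_alt, is to extract the year as text with format():
a1_alt<-f1 %>% filter(format(Date, "%Y") == "1993") #Extracting the year as a string and comparing it to "1993"
nrow(a1_alt) #Should also be 253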
We used head() to preview the first rows of a1.
head(a1,6) #To see the first 6 rows of data a1
## Date IBM_Open IBM_Close IBM_Volume XRX_Open XRX_Close XRX_Volume
## 1 1993-01-04 12.93750 12.53125 14225600 34.80457 36.17699 1378400
## 2 1993-01-05 12.50000 12.21875 13935200 36.45147 36.94554 1206500
## 3 1993-01-06 12.21875 12.00000 19358400 37.32982 37.32982 1558800
## 4 1993-01-07 12.00000 11.75000 14734800 37.38472 37.05534 870700
## 5 1993-01-08 11.68750 11.62500 15790800 36.89064 37.00044 727900
## 6 1993-01-11 11.68750 11.93750 15498800 36.56126 36.83575 715200
## DJI_Open DJI_Close DJI_Volume SP_Open GSPC_Close GSPC_Volume NAS_Open
## 1 3301.1 3309.20 21400000 435.70 435.38 201210000 675.31
## 2 3309.2 3307.90 28060000 435.38 434.34 240350000 670.52
## 3 3307.9 3305.20 35790000 434.34 434.52 295240000 678.44
## 4 3305.2 3269.00 36820000 434.52 430.73 304850000 681.56
## 5 3269.0 3251.70 30690000 430.73 429.05 263470000 677.19
## 6 3251.7 3262.75 24500000 429.04 430.95 217150000 677.46
## NAS_Close NAS_Volume
## 1 671.80 188870000
## 2 674.34 229400000
## 3 681.85 270020000
## 4 678.21 296930000
## 5 677.21 232820000
## 6 682.40 195740000
head(a1,12) #To see the first 12 rows of data a1
## Date IBM_Open IBM_Close IBM_Volume XRX_Open XRX_Close XRX_Volume
## 1 1993-01-04 12.93750 12.53125 14225600 34.80457 36.17699 1378400
## 2 1993-01-05 12.50000 12.21875 13935200 36.45147 36.94554 1206500
## 3 1993-01-06 12.21875 12.00000 19358400 37.32982 37.32982 1558800
## 4 1993-01-07 12.00000 11.75000 14734800 37.38472 37.05534 870700
## 5 1993-01-08 11.68750 11.62500 15790800 36.89064 37.00044 727900
## 6 1993-01-11 11.68750 11.93750 15498800 36.56126 36.83575 715200
## 7 1993-01-12 12.00000 12.18750 19464400 36.94554 37.49451 662300
## 8 1993-01-13 12.25000 11.93750 12833200 37.49451 37.43961 598100
## 9 1993-01-14 12.09375 12.18750 17147200 37.43961 37.38472 648000
## 10 1993-01-15 12.06250 12.06250 13492800 37.49451 38.04348 2082500
## 11 1993-01-18 12.21875 12.37500 11902000 38.15327 36.45147 1908500
## 12 1993-01-19 12.56250 12.09375 20156800 36.01230 36.89064 2183600
## DJI_Open DJI_Close DJI_Volume SP_Open GSPC_Close GSPC_Volume NAS_Open
## 1 3301.1 3309.20 21400000 435.70 435.38 201210000 675.31
## 2 3309.2 3307.90 28060000 435.38 434.34 240350000 670.52
## 3 3307.9 3305.20 35790000 434.34 434.52 295240000 678.44
## 4 3305.2 3269.00 36820000 434.52 430.73 304850000 681.56
## 5 3269.0 3251.70 30690000 430.73 429.05 263470000 677.19
## 6 3251.7 3262.75 24500000 429.04 430.95 217150000 677.46
## 7 3262.8 3264.60 29660000 430.95 431.04 239410000 681.44
## 8 3264.6 3263.60 28950000 431.03 433.03 245360000 679.21
## 9 3263.6 3267.90 36150000 433.08 435.94 281040000 690.32
## 10 3267.9 3271.10 39270000 435.87 437.15 309720000 692.46
## 11 3271.1 3274.90 23030000 437.13 436.84 196030000 694.69
## 12 3274.9 3256.00 35680000 436.84 435.13 283240000 698.38
## NAS_Close NAS_Volume
## 1 671.80 188870000
## 2 674.34 229400000
## 3 681.85 270020000
## 4 678.21 296930000
## 5 677.21 232820000
## 6 682.40 195740000
## 7 679.45 233770000
## 8 686.78 244700000
## 9 695.70 286960000
## 10 697.15 295440000
## 11 698.13 240740000
## 12 696.81 277890000
library(dplyr)
a2<- a1 %>% mutate(IBM_Change=IBM_Close-IBM_Open)
#Adding a new column, IBM_Change, to the data a1
head(a2,10) #Using the head function to see if the new column has been added or not
## Date IBM_Open IBM_Close IBM_Volume XRX_Open XRX_Close XRX_Volume
## 1 1993-01-04 12.93750 12.53125 14225600 34.80457 36.17699 1378400
## 2 1993-01-05 12.50000 12.21875 13935200 36.45147 36.94554 1206500
## 3 1993-01-06 12.21875 12.00000 19358400 37.32982 37.32982 1558800
## 4 1993-01-07 12.00000 11.75000 14734800 37.38472 37.05534 870700
## 5 1993-01-08 11.68750 11.62500 15790800 36.89064 37.00044 727900
## 6 1993-01-11 11.68750 11.93750 15498800 36.56126 36.83575 715200
## 7 1993-01-12 12.00000 12.18750 19464400 36.94554 37.49451 662300
## 8 1993-01-13 12.25000 11.93750 12833200 37.49451 37.43961 598100
## 9 1993-01-14 12.09375 12.18750 17147200 37.43961 37.38472 648000
## 10 1993-01-15 12.06250 12.06250 13492800 37.49451 38.04348 2082500
## DJI_Open DJI_Close DJI_Volume SP_Open GSPC_Close GSPC_Volume NAS_Open
## 1 3301.1 3309.20 21400000 435.70 435.38 201210000 675.31
## 2 3309.2 3307.90 28060000 435.38 434.34 240350000 670.52
## 3 3307.9 3305.20 35790000 434.34 434.52 295240000 678.44
## 4 3305.2 3269.00 36820000 434.52 430.73 304850000 681.56
## 5 3269.0 3251.70 30690000 430.73 429.05 263470000 677.19
## 6 3251.7 3262.75 24500000 429.04 430.95 217150000 677.46
## 7 3262.8 3264.60 29660000 430.95 431.04 239410000 681.44
## 8 3264.6 3263.60 28950000 431.03 433.03 245360000 679.21
## 9 3263.6 3267.90 36150000 433.08 435.94 281040000 690.32
## 10 3267.9 3271.10 39270000 435.87 437.15 309720000 692.46
## NAS_Close NAS_Volume IBM_Change
## 1 671.80 188870000 -0.40625
## 2 674.34 229400000 -0.28125
## 3 681.85 270020000 -0.21875
## 4 678.21 296930000 -0.25000
## 5 677.21 232820000 -0.06250
## 6 682.40 195740000 0.25000
## 7 679.45 233770000 0.18750
## 8 686.78 244700000 -0.31250
## 9 695.70 286960000 0.09375
## 10 697.15 295440000 0.00000
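For readers not using dplyr, the same column can be added in base R; this sketch uses the hypothetical name a2_base:
a2_base<-a1 #Copying a1 so the original is untouched
a2_base$IBM_Change<-a2_base$IBM_Close-a2_base$IBM_Open #Base R equivalent of the mutate() call above
identical(a2_base$IBM_Change, a2$IBM_Change) #Should be TRUE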
a3<-a2 %>% mutate(XRX_Change=XRX_Close-XRX_Open,DJI_Change=DJI_Close-DJI_Open,NAS_Change=NAS_Close-NAS_Open,SP_Change=GSPC_Close-SP_Open) #Adding the four new columns, XRX_Change, DJI_Change, NAS_Change and SP_Change, to the data a2
head(a3,10) #Using the head function to see whether the four new columns have been added to data a3
## Date IBM_Open IBM_Close IBM_Volume XRX_Open XRX_Close XRX_Volume
## 1 1993-01-04 12.93750 12.53125 14225600 34.80457 36.17699 1378400
## 2 1993-01-05 12.50000 12.21875 13935200 36.45147 36.94554 1206500
## 3 1993-01-06 12.21875 12.00000 19358400 37.32982 37.32982 1558800
## 4 1993-01-07 12.00000 11.75000 14734800 37.38472 37.05534 870700
## 5 1993-01-08 11.68750 11.62500 15790800 36.89064 37.00044 727900
## 6 1993-01-11 11.68750 11.93750 15498800 36.56126 36.83575 715200
## 7 1993-01-12 12.00000 12.18750 19464400 36.94554 37.49451 662300
## 8 1993-01-13 12.25000 11.93750 12833200 37.49451 37.43961 598100
## 9 1993-01-14 12.09375 12.18750 17147200 37.43961 37.38472 648000
## 10 1993-01-15 12.06250 12.06250 13492800 37.49451 38.04348 2082500
## DJI_Open DJI_Close DJI_Volume SP_Open GSPC_Close GSPC_Volume NAS_Open
## 1 3301.1 3309.20 21400000 435.70 435.38 201210000 675.31
## 2 3309.2 3307.90 28060000 435.38 434.34 240350000 670.52
## 3 3307.9 3305.20 35790000 434.34 434.52 295240000 678.44
## 4 3305.2 3269.00 36820000 434.52 430.73 304850000 681.56
## 5 3269.0 3251.70 30690000 430.73 429.05 263470000 677.19
## 6 3251.7 3262.75 24500000 429.04 430.95 217150000 677.46
## 7 3262.8 3264.60 29660000 430.95 431.04 239410000 681.44
## 8 3264.6 3263.60 28950000 431.03 433.03 245360000 679.21
## 9 3263.6 3267.90 36150000 433.08 435.94 281040000 690.32
## 10 3267.9 3271.10 39270000 435.87 437.15 309720000 692.46
## NAS_Close NAS_Volume IBM_Change XRX_Change DJI_Change NAS_Change
## 1 671.80 188870000 -0.40625 1.372418 8.099853 -3.510010
## 2 674.34 229400000 -0.28125 0.494072 -1.300049 3.820007
## 3 681.85 270020000 -0.21875 0.000000 -2.699951 3.409974
## 4 678.21 296930000 -0.25000 -0.329380 -36.199951 -3.349976
## 5 677.21 232820000 -0.06250 0.109795 -17.300049 0.020020
## 6 682.40 195740000 0.25000 0.274487 11.050049 4.940002
## 7 679.45 233770000 0.18750 0.548970 1.800049 -1.989990
## 8 686.78 244700000 -0.31250 -0.054898 -1.000000 7.570007
## 9 695.70 286960000 0.09375 -0.054897 4.299804 5.380005
## 10 697.15 295440000 0.00000 0.548969 3.200196 4.690002
## SP_Change
## 1 -0.320007
## 2 -1.040009
## 3 0.179993
## 4 -3.789978
## 5 -1.680023
## 6 1.910003
## 7 0.089997
## 8 2.000000
## 9 2.860015
## 10 1.279999
The way we used the mutate() function in the previous task already attached the new columns to the data frame. However, we could also use the cbind() function to attach the four new columns to the data a2.
a4<-cbind(a2,a3[ ,c("XRX_Change","DJI_Change","NAS_Change","SP_Change")])
#To avoid duplicating columns, we bind only the four new columns onto the data a2
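As a quick sanity check (a sketch), a4 should now match a3 column for column:
all.equal(a3, a4) #Should be TRUE, since cbind() reproduced the same columns in the same order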
a5<-a3 %>% summarise(AverageIBM_Change=mean(IBM_Change),AverageXRX_Change=mean(XRX_Change),AverageDJI_Change=mean(DJI_Change),AverageNAS_Change=mean(NAS_Change),AverageSP_Change=mean(SP_Change))
#Using the summarise() function from the dplyr package to calculate the means of the five new columns
head(a5) #To see the results of average daily changes for the 5 new columns
## AverageIBM_Change AverageXRX_Change AverageDJI_Change AverageNAS_Change
## 1 -0.02087451 0.02560416 1.783584 0.1654159
## AverageSP_Change
## 1 0.1247825
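With dplyr 1.0 or newer, across() can condense the five means into one call; this is a sketch of the same computation:
a3 %>% summarise(across(ends_with("_Change"), mean)) #One mean per column whose name ends with "_Change"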
library(GGally)
ggpairs(a3,c(17:21),title="The Relationships among the Variables")
From the scatterplot matrix, most variable pairs showed only weak linear relationships. The exceptions were DJI_Change with SP_Change and NAS_Change with SP_Change, which both had strong positive linear relationships (correlations between 0.7 and 1.0). XRX_Change and IBM_Change had a weak negative linear relationship (correlation between 0 and -0.3). The remaining pairs showed moderate positive (0.3 to 0.7) or weak positive (0 to 0.3) linear relationships.
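The correlations behind the scatterplot matrix can also be read numerically; as a sketch, cor() on the same five columns reproduces them:
round(cor(a3[ ,17:21]), 2) #Correlation matrix for the five _Change columns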
lm1<-lm(IBM_Change ~ DJI_Change,data = a3) #Naming the model lm1 avoids masking the built-in lm() function
summary(lm1)
##
## Call:
## lm(formula = IBM_Change ~ DJI_Change, data = a3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.95728 -0.11547 -0.00583 0.11253 0.55331
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.0242200 0.0136321 -1.777 0.07683 .
## DJI_Change 0.0018757 0.0007157 2.621 0.00931 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2159 on 251 degrees of freedom
## Multiple R-squared: 0.02664, Adjusted R-squared: 0.02276
## F-statistic: 6.869 on 1 and 251 DF, p-value: 0.009307
coef(lm1)
## (Intercept) DJI_Change
## -0.024219996 0.001875712
-0.024219996+20*0.001875712 #IBM_Change=-0.024219996+20*DJI_Change
## [1] 0.01329424
-0.024219996+0*0.001875712 #IBM_Change=-0.024219996+0*DJI_Change
## [1] -0.02422
0.01329424-(-0.02422) #How much IBM had increased when DJI had increased by 20 points
## [1] 0.03751424
If the Dow Jones increased by 20 points, we would expect IBM to have increased by about $0.0375.
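The same numbers can be obtained with predict() instead of hand arithmetic; a sketch, using the hypothetical name fitted_pts:
fitted_pts<-predict(lm1, data.frame(DJI_Change = c(0, 20))) #Fitted IBM_Change at DJI_Change of 0 and 20
diff(fitted_pts) #Expected IBM change for a 20-point DJI move, about 0.0375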
lm2<-lm(XRX_Change ~ DJI_Change, data=a3)
summary(lm2)
##
## Call:
## lm(formula = XRX_Change ~ DJI_Change, data = a3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.73978 -0.22710 -0.01923 0.19554 1.66121
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.014658 0.024861 0.590 0.556
## DJI_Change 0.006137 0.001305 4.702 4.25e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3937 on 251 degrees of freedom
## Multiple R-squared: 0.08095, Adjusted R-squared: 0.07729
## F-statistic: 22.11 on 1 and 251 DF, p-value: 4.253e-06
coef(lm2)
## (Intercept) DJI_Change
## 0.014657849 0.006137254
0.014657849+20*0.006137254 #XRX_Change=0.014657849+20*DJI_Change
## [1] 0.1374029
0.014657849+0*0.006137254 #XRX_Change=0.014657849+0*DJI_Change
## [1] 0.01465785
0.1374029-0.01465785 #How much XRX had increased when DJI had increased by 20 points
## [1] 0.122745
If the Dow Jones increased by 20 points, we would expect XRX to have increased by about $0.1227.
A quicker way to see how IBM and XRX would move if the DJI increased by 20 points: the intercepts cancel out of the differences, so each expected change is simply 20 times the slope.
20*0.001875712
## [1] 0.03751424
20*0.006137254
## [1] 0.1227451
Since 0.122745 > 0.03751424, the Dow Jones had a stronger influence on XRX than on IBM in 1993.
set.seed(100)
row_data<-sample(1:nrow(a3), (140)/nrow(a3)*nrow(a3)) #Randomly selecting 140 of the 253 rows to split the data a3 into two sets
train<-a3[row_data, ] #Setting up training set
valid<-a3[-row_data, ] #Setting up validation set
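A quick size check on the split (a sketch):
nrow(train) #Expect 140
nrow(valid) #Expect 253-140 = 113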
multi_regression<-lm(IBM_Change ~ DJI_Change+NAS_Change+SP_Change, data=train)
summary(multi_regression)
##
## Call:
## lm(formula = IBM_Change ~ DJI_Change + NAS_Change + SP_Change,
## data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.46955 -0.10418 -0.01938 0.09081 0.53979
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.015141 0.017007 -0.890 0.375
## DJI_Change 0.001354 0.001948 0.695 0.488
## NAS_Change 0.007139 0.006044 1.181 0.240
## SP_Change 0.002340 0.017636 0.133 0.895
##
## Residual standard error: 0.1998 on 136 degrees of freedom
## Multiple R-squared: 0.06048, Adjusted R-squared: 0.03976
## F-statistic: 2.918 on 3 and 136 DF, p-value: 0.03647
Multiple linear regression describes the relationship between one dependent variable and several independent variables. It can be used to gauge the strength of the effect each independent variable has on the dependent variable, and to forecast the impact of changes. In other words, multiple linear regression helps us understand how much the dependent variable will change when we change the independent variables.
In this case, we use multiple linear regression to examine the relationship between three independent variables (DJI_Change, NAS_Change and SP_Change) and one dependent variable (IBM_Change).
To interpret the direction of each relationship, we look at the sign of its coefficient. All three coefficients are positive, so each predictor's relationship with the dependent variable is positive. In addition, a low p-value (< 0.05) would lead us to reject the null hypothesis that a coefficient is zero. Here, all three predictors have high p-values, so none is statistically significant at the usual 0.05 level.
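The confidence intervals tell the same story; as a sketch, each interval should span zero, echoing the high p-values:
confint(multi_regression) #95% confidence intervals for the coefficients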
pred<-predict(multi_regression,valid)
pred
## 1 3 8 9 10
## -2.997806e-02 5.967032e-03 4.222754e-02 3.578372e-02 2.567073e-02
## 11 14 17 18 19
## 1.388474e-02 3.534528e-02 -2.558387e-02 -8.628245e-02 1.139360e-03
## 21 22 23 24 26
## 5.073624e-02 6.774708e-04 6.359798e-02 3.965445e-02 -4.541135e-02
## 27 29 31 32 33
## -8.962208e-02 -5.721808e-03 -2.780084e-01 -3.808863e-02 -7.158353e-02
## 35 36 37 38 39
## -4.868194e-02 -4.584036e-02 1.236531e-01 4.351830e-02 1.758578e-02
## 44 45 47 57 59
## 8.020676e-03 1.234516e-01 2.852468e-02 -2.742922e-02 -6.133732e-02
## 61 64 66 67 73
## 1.697368e-02 -1.859572e-01 -8.038966e-02 4.360586e-02 -2.944829e-03
## 74 75 77 79 81
## -4.268239e-02 -6.492878e-02 -3.742621e-02 -1.221658e-01 3.358257e-04
## 84 87 88 89 91
## 2.800403e-02 -5.458689e-02 -1.579496e-02 -3.402555e-03 3.494658e-03
## 93 94 95 100 101
## -9.967243e-03 1.980023e-05 -1.934096e-02 1.387030e-02 8.712141e-02
## 106 107 108 110 111
## -1.999471e-02 -2.971501e-02 -9.039326e-02 5.290857e-03 -5.090515e-02
## 114 116 122 130 131
## -6.005592e-02 -1.883588e-03 2.224035e-02 7.786057e-02 2.198013e-02
## 132 133 138 140 141
## -2.784892e-03 -2.383822e-02 2.953595e-02 -9.976802e-02 3.894184e-02
## 142 144 147 149 150
## 3.440010e-02 -6.671441e-03 2.782979e-02 5.103376e-03 -1.893584e-02
## 151 154 156 157 161
## 1.212108e-02 2.981940e-03 -7.517796e-03 4.340666e-02 -3.957908e-03
## 163 166 167 170 172
## 6.296957e-02 -8.525779e-03 5.411397e-03 -4.223054e-02 -1.282158e-01
## 176 178 180 186 188
## -2.539724e-02 7.241003e-02 -3.319136e-02 4.221377e-02 -3.542750e-02
## 189 190 191 192 195
## -1.512593e-02 3.708006e-02 -1.031515e-02 -3.228328e-02 -5.670149e-03
## 197 199 202 204 205
## -1.332079e-02 4.459319e-02 -1.186962e-01 -1.175432e-02 -1.455986e-02
## 211 215 216 217 219
## 2.837017e-02 5.559464e-02 8.585500e-03 -4.838143e-02 -2.125292e-02
## 220 223 225 229 230
## 1.755865e-02 -8.279360e-02 -1.008769e-02 -1.059414e-02 -6.094955e-02
## 237 240 245 246 247
## 1.933845e-02 2.071210e-02 -7.923031e-03 -4.893567e-02 3.895553e-02
## 250 251 252
## 1.222447e-02 1.614346e-02 -2.721047e-02
resi<-round(valid$IBM_Change[1:10] - pred[1:10], 4) #Computing the residuals for the first 10 validation rows
pred_err<-data.frame("Predicted"=round(pred[1:10], 4), "Actual"=round(valid$IBM_Change[1:10],4),"Residual"=resi) #Creating a data frame to compare predictions with actual values
pred_err
## Predicted Actual Residual
## 1 -0.0300 -0.4062 -0.3763
## 3 0.0060 -0.2188 -0.2247
## 8 0.0422 -0.3125 -0.3547
## 9 0.0358 0.0938 0.0580
## 10 0.0257 0.0000 -0.0257
## 11 0.0139 0.1562 0.1424
## 14 0.0353 -0.1562 -0.1916
## 17 -0.0256 -0.9688 -0.9432
## 18 -0.0863 0.1250 0.2113
## 19 0.0011 0.0625 0.0614
library(forecast)
accuracy(pred,valid$IBM_Change)
## ME RMSE MAE MPE MAPE
## Test set -0.02154245 0.2313934 0.1708795 NaN Inf
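As a sketch, the RMSE reported by accuracy() can be verified by hand:
sqrt(mean((valid$IBM_Change - pred)^2)) #Should match the RMSE above, about 0.2314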
bregression<- step(multi_regression, direction = "backward")
## Start: AIC=-446.92
## IBM_Change ~ DJI_Change + NAS_Change + SP_Change
##
## Df Sum of Sq RSS AIC
## - SP_Change 1 0.000703 5.4321 -448.90
## - DJI_Change 1 0.019299 5.4507 -448.43
## - NAS_Change 1 0.055725 5.4871 -447.49
## <none> 5.4314 -446.92
##
## Step: AIC=-448.9
## IBM_Change ~ DJI_Change + NAS_Change
##
## Df Sum of Sq RSS AIC
## <none> 5.4321 -448.90
## - NAS_Change 1 0.083511 5.5156 -448.77
## - DJI_Change 1 0.096406 5.5285 -448.44
#The step() function was used to perform backward variable selection
library(forecast)
pred2<-predict(bregression,valid)
accuracy(pred2,valid$IBM_Change)
## ME RMSE MAE MPE MAPE
## Test set -0.02144154 0.2312406 0.1708243 NaN Inf
With backward elimination, we start with all the variables and remove them one at a time until no removal improves the model. One way to compare candidate models is the AIC value (the lower, the better). In the first phase, the AIC was -446.92 with all three variables. The AIC column shows what the AIC would become if each variable were dropped: dropping SP_Change gives the lowest value (-448.90), so step() removes SP_Change, leaving the model IBM_Change ~ DJI_Change + NAS_Change.
In the second phase, keeping both remaining variables gives AIC -448.90, and dropping either DJI_Change or NAS_Change would raise the AIC again, so the procedure stops there. Backward elimination therefore yields a slightly better model, although the improvement over the original model is modest (-448.90 vs. -446.92).
Running the accuracy() function on the selected model, IBM_Change ~ DJI_Change + NAS_Change, tells the same story: its validation error (RMSE 0.2312) is essentially the same as the full model's (RMSE 0.2314).
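As a sketch, we can confirm directly which model step() kept; extractAIC() reports AIC on the same scale step() uses:
formula(bregression) #IBM_Change ~ DJI_Change + NAS_Change
extractAIC(bregression) #Degrees of freedom and AIC, about -448.9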
str(c1)
## 'data.frame': 85 obs. of 13 variables:
## $ competitorname : Factor w/ 85 levels "100 Grand","3 Musketeers",..: 1 2 45 46 3 4 5 6 7 8 ...
## $ chocolate : int 1 1 0 0 0 1 1 0 0 0 ...
## $ fruity : int 0 0 0 0 1 0 0 0 0 1 ...
## $ caramel : int 1 0 0 0 0 0 1 0 0 1 ...
## $ peanutyalmondy : int 0 0 0 0 0 1 1 1 0 0 ...
## $ nougat : int 0 1 0 0 0 0 1 0 0 0 ...
## $ crispedricewafer: int 1 0 0 0 0 0 0 0 0 0 ...
## $ hard : int 0 0 0 0 0 0 0 0 0 0 ...
## $ bar : int 1 1 0 0 0 1 1 0 0 0 ...
## $ pluribus : int 0 0 0 0 0 0 0 1 1 0 ...
## $ sugarpercent : num 0.732 0.604 0.011 0.011 0.906 ...
## $ pricepercent : num 0.86 0.511 0.116 0.511 0.511 ...
## $ winpercent : num 67 67.6 32.3 46.1 52.3 ...
head(c1,10)
## competitorname chocolate fruity caramel peanutyalmondy nougat
## 1 100 Grand 1 0 1 0 0
## 2 3 Musketeers 1 0 0 0 1
## 3 One dime 0 0 0 0 0
## 4 One quarter 0 0 0 0 0
## 5 Air Heads 0 1 0 0 0
## 6 Almond Joy 1 0 0 1 0
## 7 Baby Ruth 1 0 1 1 1
## 8 Boston Baked Beans 0 0 0 1 0
## 9 Candy Corn 0 0 0 0 0
## 10 Caramel Apple Pops 0 1 1 0 0
## crispedricewafer hard bar pluribus sugarpercent pricepercent winpercent
## 1 1 0 1 0 0.732 0.860 66.97173
## 2 0 0 1 0 0.604 0.511 67.60294
## 3 0 0 0 0 0.011 0.116 32.26109
## 4 0 0 0 0 0.011 0.511 46.11650
## 5 0 0 0 0 0.906 0.511 52.34146
## 6 0 0 1 0 0.465 0.767 50.34755
## 7 0 0 1 0 0.604 0.767 56.91455
## 8 0 0 0 1 0.313 0.511 23.41782
## 9 0 0 0 1 0.906 0.325 38.01096
## 10 0 0 0 0 0.604 0.325 34.51768
c1$winpercent<-ifelse(c1$winpercent >=50,"winner","loser") #Labeling winner and loser
c1<-c1[ ,-12]
#Dropping pricepercent column in data c1
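A quick tabulation (a sketch) shows the class balance after relabeling:
table(c1$winpercent) #Counts of losers and winners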
Since there are not many candies that dog owners can share with their beloved dogs, I would invent a candy that is dog-friendly. The new candy I came up with is a bacon-flavored, dog-friendly candy called Happy Paws. Happy Paws and its packaging would look similar to the picture.
happypaws<-data.frame("competitorname"="HappyPaws","chocolate"=0,"fruity"=0,
"caramel"=0,"peanutyalmondy"=0,"nougat"=1,"crispedricewafer"=0,"hard"=1,"bar"=0,
"pluribus"=1,"sugarpercent"=.001)
#Creating the new data row for Happy Paws
happypaws
## competitorname chocolate fruity caramel peanutyalmondy nougat
## 1 HappyPaws 0 0 0 0 1
## crispedricewafer hard bar pluribus sugarpercent
## 1 0 1 0 1 0.001
set.seed(50)
row_candy<-sample(1:nrow(c1), 0.6*nrow(c1)) #Splitting the candy data into two sets
train2<-c1[row_candy, ] #60% of data is training set
valid2<-c1[-row_candy, ] #40% of data is validation set
train.norm.2<-train2
valid.norm.2<-valid2
library(caret)
norm.values<-preProcess(train2[ ,c(2:11)], method=c("center","scale"))
train.norm.2[ ,c(2:11)]<-predict(norm.values,train2[ ,c(2:11)])
valid.norm.2[ ,c(2:11)]<-predict(norm.values,valid2[ ,c(2:11)])
happypaws.norm <- predict(norm.values,happypaws[ ,c(2:11)])
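To confirm the centering and scaling worked, a sketch: the training predictors should now have means near 0 and standard deviations near 1:
round(colMeans(train.norm.2[ ,c(2:11)]), 3) #Means should be ~0
round(apply(train.norm.2[ ,c(2:11)], 2, sd), 3) #Standard deviations should be ~1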
library(FNN)
k1<-knn(train.norm.2[ ,c(2:11)],happypaws.norm,cl=train.norm.2[ ,12],k=3)
row.names(train2)[attr(k1,"nn.index")] #The attribute is "nn.index"; these indices refer to rows of the training set
k1
## [1] loser
## attr(,"nn.index")
## [,1] [,2] [,3]
## [1,] 49 17 13
## attr(,"nn.dist")
## [,1] [,2] [,3]
## [1,] 4.670527 4.956422 5.212694
## Levels: loser
Unfortunately, Happy Paws is predicted to be a loser.
knnresults<-c1[c(49,17,13), ] #Looking up the three nearest neighbors
knnresults
## competitorname chocolate fruity caramel peanutyalmondy nougat
## 49 Pixie Sticks 0 0 0 0 0
## 17 Fun Dip 0 1 0 0 0
## 13 Chiclets 0 1 0 0 0
## crispedricewafer hard bar pluribus sugarpercent winpercent
## 49 0 0 0 1 0.093 loser
## 17 0 1 0 0 0.732 loser
## 13 0 0 0 1 0.046 loser
library(caret)
library(FNN)
valid.norm.2$winpercent<-as.factor(valid.norm.2$winpercent)
#Converting winpercent to a factor so that knn.pred and valid.norm.2 have matching types for confusionMatrix()
accuracy2 <- data.frame(k = seq(1,10,1), accuracy = rep(0, 10))
# creating a data frame with 2 columns, k and accuracy.
for (i in 1:10){
knn.pred <-knn(train.norm.2[ ,c(2:11)], valid.norm.2[ ,c(2:11)],cl=train.norm.2[ ,12], k=i)
accuracy2[i,2]<-confusionMatrix(knn.pred ,valid.norm.2[ ,12])$overall[1]
}
#Computing knn predictions on the validation set for each k from 1 to 10
accuracy2
## k accuracy
## 1 1 0.6764706
## 2 2 0.6764706
## 3 3 0.7058824
## 4 4 0.6764706
## 5 5 0.7058824
## 6 6 0.6764706
## 7 7 0.7058824
## 8 8 0.7058824
## 9 9 0.7647059
## 10 10 0.7058824
library(ggplot2)
ggplot(accuracy2,aes(x=k,y=accuracy))+
  theme_bw()+
  geom_bar(stat = "identity",fill="light green")+
  scale_x_continuous(breaks=1:10)+
  geom_text(aes(label=round(accuracy,digits=5)),angle=50,size=3)+
  labs(y="Accuracy Level",x="k Level (Number of Neighbors)",title="Accuracies among Different Neighbors")
From the chart, we can see that k = 9 gave the best accuracy (about 0.765).
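Instead of reading the bar chart, the best k can be picked programmatically; a sketch:
accuracy2$k[which.max(accuracy2$accuracy)] #Expect 9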
library(FNN)
k9<-knn(train.norm.2[ ,c(2:11)],happypaws.norm,cl=train.norm.2[ ,12],k=9)
row.names(train.norm.2)[attr(k9,"nn.index")] #Again using the "nn.index" attribute for the neighbor positions
#Using k=9
k9
## [1] loser
## attr(,"nn.index")
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9]
## [1,] 49 17 13 12 45 6 22 32 23
## attr(,"nn.dist")
## [,1] [,2] [,3] [,4] [,5] [,6] [,7]
## [1,] 4.670527 4.956422 5.212694 5.217029 5.328305 5.328305 5.336405
## [,8] [,9]
## [1,] 5.363862 5.379986
## Levels: loser
knnresults2<-c1[c(49,17,13,12,45,6,22,32,23), ] #Looking up the nine nearest neighbors
knnresults2
## competitorname chocolate fruity caramel peanutyalmondy
## 49 Pixie Sticks 0 0 0 0
## 17 Fun Dip 0 1 0 0
## 13 Chiclets 0 1 0 0
## 12 Chewey Lemonhead Fruit Mix 0 1 0 0
## 45 Nik L Nip 0 1 0 0
## 6 Almond Joy 1 0 0 1
## 22 Haribo Twin Snakes 0 1 0 0
## 32 Lifesavers big ring gummies 0 1 0 0
## 23 Hershey's Kisses 1 0 0 0
## nougat crispedricewafer hard bar pluribus sugarpercent winpercent
## 49 0 0 0 0 1 0.093 loser
## 17 0 0 1 0 0 0.732 loser
## 13 0 0 0 0 1 0.046 loser
## 12 0 0 0 0 1 0.732 loser
## 45 0 0 0 0 1 0.197 loser
## 6 0 0 0 1 0 0.465 winner
## 22 0 0 0 0 1 0.465 loser
## 32 0 0 0 0 0 0.267 winner
## 23 0 0 0 0 1 0.127 winner