#install.packages("ggplot2")
require(ggplot2)
## Loading required package: ggplot2
# Load the S&P stock data from GitHub (the data creator is credited in the
# repository description), then report its basic dimensions and structure.
#
# stringsAsFactors = TRUE is set explicitly: R >= 4.0 no longer converts
# strings to factors by default, and the str()/summary() output recorded
# below (Direction as a two-level factor with Down/Up counts) relies on it.
urldata <- read.csv(
  "https://raw.githubusercontent.com/jrovalino/RWk3FinalProject/master/Smarket.csv",
  header = TRUE,
  sep = ",",
  stringsAsFactors = TRUE
)
# urldata  # uncomment to print the full data frame
nrow(urldata)
## [1] 1250
ncol(urldata)
## [1] 10
str(urldata)
## 'data.frame': 1250 obs. of 10 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Year : int 2001 2001 2001 2001 2001 2001 2001 2001 2001 2001 ...
## $ Lag1 : num 0.381 0.959 1.032 -0.623 0.614 ...
## $ Lag2 : num -0.192 0.381 0.959 1.032 -0.623 ...
## $ Lag3 : num -2.624 -0.192 0.381 0.959 1.032 ...
## $ Lag4 : num -1.055 -2.624 -0.192 0.381 0.959 ...
## $ Lag5 : num 5.01 -1.055 -2.624 -0.192 0.381 ...
## $ Volume : num 1.19 1.3 1.41 1.28 1.21 ...
## $ Today : num 0.959 1.032 -0.623 0.614 0.213 ...
## $ Direction: Factor w/ 2 levels "Down","Up": 2 2 1 2 2 2 1 2 2 2 ...

# Data exploration: summary statistics for every column.
summary(urldata)
## X Year Lag1 Lag2
## Min. : 1.0 Min. :2001 Min. :-4.922000 Min. :-4.922000
## 1st Qu.: 313.2 1st Qu.:2002 1st Qu.:-0.639500 1st Qu.:-0.639500
## Median : 625.5 Median :2003 Median : 0.039000 Median : 0.039000
## Mean : 625.5 Mean :2003 Mean : 0.003834 Mean : 0.003919
## 3rd Qu.: 937.8 3rd Qu.:2004 3rd Qu.: 0.596750 3rd Qu.: 0.596750
## Max. :1250.0 Max. :2005 Max. : 5.733000 Max. : 5.733000
## Lag3 Lag4 Lag5
## Min. :-4.922000 Min. :-4.922000 Min. :-4.92200
## 1st Qu.:-0.640000 1st Qu.:-0.640000 1st Qu.:-0.64000
## Median : 0.038500 Median : 0.038500 Median : 0.03850
## Mean : 0.001716 Mean : 0.001636 Mean : 0.00561
## 3rd Qu.: 0.596750 3rd Qu.: 0.596750 3rd Qu.: 0.59700
## Max. : 5.733000 Max. : 5.733000 Max. : 5.73300
## Volume Today Direction
## Min. :0.3561 Min. :-4.922000 Down:602
## 1st Qu.:1.2574 1st Qu.:-0.639500 Up :648
## Median :1.4229 Median : 0.038500
## Mean :1.4783 Mean : 0.003138
## 3rd Qu.:1.6417 3rd Qu.: 0.596750
## Max. :3.1525 Max. : 5.733000
# Mean and median of the Volume column, per year and for the whole data set.
# Volume is always read by name: the medians previously used column position
# 8, which silently picks the wrong column if the layout ever changes.
YrsofData <- unique(urldata$Year)
YrsofData
## [1] 2001 2002 2003 2004 2005

# 2001 mean and median
data2001 <- subset(urldata, Year == 2001)
Vol2001MeanRow <- mean(data2001[["Volume"]])
Vol2001MeanRow
## [1] 1.22695
Vol2001MedianRow <- median(data2001[["Volume"]])
Vol2001MedianRow
## [1] 1.2174

# 2002 mean and median
data2002 <- subset(urldata, Year == 2002)
Vol2002MeanRow <- mean(data2002[["Volume"]])
Vol2002MeanRow
## [1] 1.428451
Vol2002MedianRow <- median(data2002[["Volume"]])
Vol2002MedianRow
## [1] 1.3786

# 2003 mean and median
data2003 <- subset(urldata, Year == 2003)
Vol2003MeanRow <- mean(data2003[["Volume"]])
Vol2003MeanRow
## [1] 1.384693
Vol2003MedianRow <- median(data2003[["Volume"]])
Vol2003MedianRow
## [1] 1.4

# 2004 mean and median
data2004 <- subset(urldata, Year == 2004)
Vol2004MeanRow <- mean(data2004[["Volume"]])
Vol2004MeanRow
## [1] 1.424159
Vol2004MedianRow <- median(data2004[["Volume"]])
Vol2004MedianRow
## [1] 1.4227

# 2005 mean and median
data2005 <- subset(urldata, Year == 2005)
Vol2005MeanRow <- mean(data2005[["Volume"]])
Vol2005MeanRow
## [1] 1.917298
Vol2005MedianRow <- median(data2005[["Volume"]])
Vol2005MedianRow
## [1] 1.896905

# Total mean and median (2001-2005)
TotVolMeanRow <- mean(urldata[["Volume"]])
TotVolMeanRow
## [1] 1.478305
TotVolMedianRow <- median(urldata[["Volume"]])
TotVolMedianRow
## [1] 1.42295

# Fraction by which the 2005 mean exceeds the total mean. The ratio itself
# (1.296957) was stored here before; subtracting 1 makes this a fraction on
# the same scale as Perc2001Worse and matches the "29% higher" stated below.
Perc2005Better <- (Vol2005MeanRow / TotVolMeanRow) - 1
Perc2005Better
## [1] 0.296957 (approx.: the previously recorded ratio 1.296957 minus 1)

# Fraction by which the 2001 mean falls short of the total mean.
Perc2001Worse <- 1 - (Vol2001MeanRow / TotVolMeanRow)
Perc2001Worse
## [1] 0.1700295

# Volume analysis:
#   Volume mean per year (2001-2005 respectively):
#     1.22695, 1.428451, 1.384693, 1.424159, 1.917298
#   Total volume mean: 1.478305
#   The volume mean in 2005 was the highest, 29% higher than the total mean
#   (2001-2005). The volume mean in 2001 was the lowest, 17% lower than the
#   total mean (2001-2005).
# Give the full data set short headers: the five Lag columns become
# %1D..%5D (percent return N days prior), Volume becomes Vol, Today becomes
# %Today and Direction becomes Dir. Then show the first and last six rows.
names(urldata) <- c(
  "X", "Year",
  "%1D", "%2D", "%3D", "%4D", "%5D",
  "Vol", "%Today", "Dir"
)
head(urldata, n = 6L)
## X Year %1D %2D %3D %4D %5D Vol %Today Dir
## 1 1 2001 0.381 -0.192 -2.624 -1.055 5.010 1.1913 0.959 Up
## 2 2 2001 0.959 0.381 -0.192 -2.624 -1.055 1.2965 1.032 Up
## 3 3 2001 1.032 0.959 0.381 -0.192 -2.624 1.4112 -0.623 Down
## 4 4 2001 -0.623 1.032 0.959 0.381 -0.192 1.2760 0.614 Up
## 5 5 2001 0.614 -0.623 1.032 0.959 0.381 1.2057 0.213 Up
## 6 6 2001 0.213 0.614 -0.623 1.032 0.959 1.3491 1.392 Up
tail(urldata, n = 6L)
## X Year %1D %2D %3D %4D %5D Vol %Today Dir
## 1245 1245 2005 0.252 -0.024 -0.584 -0.285 -0.141 2.06517 0.422 Up
## 1246 1246 2005 0.422 0.252 -0.024 -0.584 -0.285 1.88850 0.043 Up
## 1247 1247 2005 0.043 0.422 0.252 -0.024 -0.584 1.28581 -0.955 Down
## 1248 1248 2005 -0.955 0.043 0.422 0.252 -0.024 1.54047 0.130 Up
## 1249 1249 2005 0.130 -0.955 0.043 0.422 0.252 1.42236 -0.298 Down
## 1250 1250 2005 -0.298 0.130 -0.955 0.043 0.422 1.38254 -0.489 Down
# The Lag# columns were renamed to more meaningful names that represent the percent return # days prior, e.g. %1D.
# Data wrangling is also demonstrated by the creation of five per-year subsets as new data frames (used above) and by the renaming of the lag fields.
# Apply one shared set of short column names to every per-year subset and
# preview the first rows of each. The name vector is defined once so the
# five subsets cannot drift apart through a typo in one of the copies.
pct_col_names <- c(
  "X", "Year",
  "%1D", "%2D", "%3D", "%4D", "%5D",
  "Vol", "%Today", "Dir"
)

colnames(data2001) <- pct_col_names
head(data2001)
## X Year %1D %2D %3D %4D %5D Vol %Today Dir
## 1 1 2001 0.381 -0.192 -2.624 -1.055 5.010 1.1913 0.959 Up
## 2 2 2001 0.959 0.381 -0.192 -2.624 -1.055 1.2965 1.032 Up
## 3 3 2001 1.032 0.959 0.381 -0.192 -2.624 1.4112 -0.623 Down
## 4 4 2001 -0.623 1.032 0.959 0.381 -0.192 1.2760 0.614 Up
## 5 5 2001 0.614 -0.623 1.032 0.959 0.381 1.2057 0.213 Up
## 6 6 2001 0.213 0.614 -0.623 1.032 0.959 1.3491 1.392 Up

colnames(data2002) <- pct_col_names
head(data2002)
## X Year %1D %2D %3D %4D %5D Vol %Today Dir
## 243 243 2002 -1.115 0.336 0.675 0.412 -0.021 0.9436 0.574 Up
## 244 244 2002 0.574 -1.115 0.336 0.675 0.412 1.1710 0.918 Up
## 245 245 2002 0.918 0.574 -1.115 0.336 0.675 1.3989 0.621 Up
## 246 246 2002 0.621 0.918 0.574 -1.115 0.336 1.5130 -0.650 Down
## 247 247 2002 -0.650 0.621 0.918 0.574 -1.115 1.3083 -0.359 Down
## 248 248 2002 -0.359 -0.650 0.621 0.918 0.574 1.2588 -0.480 Down

colnames(data2003) <- pct_col_names
head(data2003)
## X Year %1D %2D %3D %4D %5D Vol %Today Dir
## 495 495 2003 0.049 0.456 -1.603 -0.315 -0.547 1.0885 3.320 Up
## 496 496 2003 3.320 0.049 0.456 -1.603 -0.315 1.2292 -0.048 Down
## 497 497 2003 -0.048 3.320 0.049 0.456 -1.603 1.1308 2.247 Up
## 498 498 2003 2.247 -0.048 3.320 0.049 0.456 1.4359 -0.654 Down
## 499 499 2003 -0.654 2.247 -0.048 3.320 0.049 1.5452 -1.409 Down
## 500 500 2003 -1.409 -0.654 2.247 -0.048 3.320 1.4676 1.939 Up

colnames(data2004) <- pct_col_names
head(data2004)
## X Year %1D %2D %3D %4D %5D Vol %Today Dir
## 747 747 2004 0.205 0.014 1.240 0.169 -0.181 1.0275 -0.309 Down
## 748 748 2004 -0.309 0.205 0.014 1.240 0.169 1.1532 1.240 Up
## 749 749 2004 1.240 -0.309 0.205 0.014 1.240 1.5782 0.129 Up
## 750 750 2004 0.129 1.240 -0.309 0.205 0.014 1.4945 0.237 Up
## 751 751 2004 0.237 0.129 1.240 -0.309 0.205 1.7049 0.496 Up
## 752 752 2004 0.496 0.237 0.129 1.240 -0.309 1.8684 -0.889 Down

colnames(data2005) <- pct_col_names
head(data2005)
## X Year %1D %2D %3D %4D %5D Vol %Today Dir
## 999 999 2005 -0.134 0.008 -0.007 0.715 -0.431 0.7869 -0.812 Down
## 1000 1000 2005 -0.812 -0.134 0.008 -0.007 0.715 1.5108 -1.167 Down
## 1001 1001 2005 -1.167 -0.812 -0.134 0.008 -0.007 1.7210 -0.363 Down
## 1002 1002 2005 -0.363 -1.167 -0.812 -0.134 0.008 1.7389 0.351 Up
## 1003 1003 2005 0.351 -0.363 -1.167 -0.812 -0.134 1.5691 -0.143 Down
## 1004 1004 2005 -0.143 0.351 -0.363 -1.167 -0.812 1.4779 0.342 Up
# Plot the %Today return for each year and for the full 2001-2005 data set:
# a boxplot, a histogram, and a scatter plot of %Today against Vol.
#
# The scatter plots previously passed '%Today' to qplot() as a quoted string.
# ggplot2 reads a quoted string as a constant value, not as a column, so the
# y axis held only the literal label "%Today". A non-syntactic column name
# has to be wrapped in backticks to be looked up in the data.

# Draw the three %Today plots for one data frame.
#   df:    data frame with numeric columns `Vol` and `%Today`.
#   label: text appended to each plot title to identify the data shown.
# Returns `df` invisibly; called for its plotting side effects.
plot_today_returns <- function(df, label) {
  today <- df[["%Today"]]
  boxplot(today, main = paste("%Today -", label), ylab = "%Today")
  hist(today, main = paste("Histogram of %Today -", label), xlab = "%Today")
  # A ggplot object is not auto-printed inside a function, so print it.
  print(
    ggplot(df, aes(x = Vol, y = `%Today`)) +
      geom_point() +
      ggtitle(paste("%Today vs Vol -", label))
  )
  invisible(df)
}

today_datasets <- list(
  "2001" = data2001,
  "2002" = data2002,
  "2003" = data2003,
  "2004" = data2004,
  "2005" = data2005,
  "2001-2005" = urldata
)
for (label in names(today_datasets)) {
  plot_today_returns(today_datasets[[label]], label)
}
# Question: What were the largest single-day percent changes, up and down? In which year did each occur? Display %Today visually (hint: use a graph).
# Find the rows holding the largest single-day gain and the largest
# single-day loss in %Today, then show the distribution of %Today.
str(urldata)
## 'data.frame': 1250 obs. of 10 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Year : int 2001 2001 2001 2001 2001 2001 2001 2001 2001 2001 ...
## $ %1D : num 0.381 0.959 1.032 -0.623 0.614 ...
## $ %2D : num -0.192 0.381 0.959 1.032 -0.623 ...
## $ %3D : num -2.624 -0.192 0.381 0.959 1.032 ...
## $ %4D : num -1.055 -2.624 -0.192 0.381 0.959 ...
## $ %5D : num 5.01 -1.055 -2.624 -0.192 0.381 ...
## $ Vol : num 1.19 1.3 1.41 1.28 1.21 ...
## $ %Today: num 0.959 1.032 -0.623 0.614 0.213 ...
## $ Dir : Factor w/ 2 levels "Down","Up": 2 2 1 2 2 2 1 2 2 2 ...

# Row with the highest %Today (first occurrence if there are ties).
urldata[which.max(urldata[["%Today"]]), ]
## X Year %1D %2D %3D %4D %5D Vol %Today Dir
## 383 383 2002 -2.702 -3.291 -3.835 -2.702 0.566 2.44102 5.733 Up

# Row with the lowest %Today (first occurrence if there are ties).
urldata[which.min(urldata[["%Today"]]), ]
## X Year %1D %2D %3D %4D %5D Vol %Today Dir
## 169 169 2001 0.623 -1.864 -2.239 -0.106 -0.056 1.2766 -4.922 Down

# urldata
# Kept in `$` form so the automatically generated plot title is unchanged.
hist(urldata$"%Today")
# Answer:
# The largest up % change was 5.733, in 2002.
# The largest down % change was -4.922, in 2001.
# Answer: Please see the beginning of the code - the data is pulled from GitHub using: urldata <- read.csv("https://raw.githubusercontent.com/jrovalino/RWk3FinalProject/master/Smarket.csv", header = TRUE, sep = ",")