library(ggplot2)
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.4.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
First, load the data into RStudio from GitHub. I chose the "Highest Winning Percents Entering College World Series" data set. The request for this data set is: “the data needs to be comma delimited, instead of space delimited to make it easier to work with the teams with two words in their names. The seasons wins and losses need to be broken into two columns, because they are different variable, the title for CWS W-L needs to also be two columns, a win and a loss column for the world series.” I will try to format some of the data as I read it into R.
# Read the raw table straight from GitHub. The first three lines of the file
# are skipped and no header row is read (header = FALSE), so the columns are
# labelled explicitly below.
worldseries <- read.csv(
  "https://raw.githubusercontent.com/ntlrs/data607project2/master/rawworldseriesdata",
  skip = 3,
  header = FALSE
)
# Label the six columns in a single assignment.
colnames(worldseries)[1:6] <- c(
  "Team", "Year", "W-L", "Pct.", "World Series W-L", "Finish"
)
worldseries
## Team Year W-L Pct. World Series W-L Finish
## 1 Penn St. 1957 19-0 1.000 3-2 (2nd)
## 2 Ithaca 1962 17-0 1.000 1-2 (5th)
## 3 Oklahoma St. 1955 24-1 0.960 3-2 (3rd)
## 4 Oklahoma St. 1961 24-1 0.960 3-2 (2nd)
## 5 UConn 1959 20-1 0.952 0-2 (7th)
## 6 Tennessee 1951 16-1 0.941 4-2 (2nd)
## 7 Arizona St. 1972 60-4 0.938 4-2 (2nd)
## 8 Utah 1951 15-1 0.938 2-2 (3rd)
## 9 Wake Forest 1949 29-2 0.935 2-2 (2nd)
## 10 Texas 1982 57-4 0.934 2-2 (3rd)
## 11 Tulsa 1969 36-3 0.923 3-2 (2nd)
## 12 Northern Colo. 1955 24-2 0.923 1-2 (5th)
## 13 South Carolina 1975 47-4 0.922 4-2 (2nd)
## 14 Harvard 1973 35-3 0.921 0-2 (7th)
## 15 Texas 1975 52-5 0.912 4-1 (1st)
## 16 Texas 1973 48-5 0.906 2-2 (3rd)
## 17 Arizona St. 1973 56-6 0.903 3-2 (2nd)
## 18 Texas 1979 53-6 0.898 2-2 (4th)
## 19 Texas 1974 52-6 0.897 2-2 (4th)
## 20 Arizona St. 1964 43-5 0.896 1-2 (5th)
Now I need to split the W-L columns and remove the parentheses around the finishes.
# Split the regular-season record ("W-L") at the hyphen into two columns.
ws1 <- worldseries %>%
  separate("W-L", into = c("Wins", "Loses"), sep = "-")
# Split the College World Series record the same way.
ws2 <- ws1 %>%
  separate("World Series W-L", into = c("WS Wins", "WS Loses"), sep = "-")
ws2
## Team Year Wins Loses Pct. WS Wins WS Loses Finish
## 1 Penn St. 1957 19 0 1.000 3 2 (2nd)
## 2 Ithaca 1962 17 0 1.000 1 2 (5th)
## 3 Oklahoma St. 1955 24 1 0.960 3 2 (3rd)
## 4 Oklahoma St. 1961 24 1 0.960 3 2 (2nd)
## 5 UConn 1959 20 1 0.952 0 2 (7th)
## 6 Tennessee 1951 16 1 0.941 4 2 (2nd)
## 7 Arizona St. 1972 60 4 0.938 4 2 (2nd)
## 8 Utah 1951 15 1 0.938 2 2 (3rd)
## 9 Wake Forest 1949 29 2 0.935 2 2 (2nd)
## 10 Texas 1982 57 4 0.934 2 2 (3rd)
## 11 Tulsa 1969 36 3 0.923 3 2 (2nd)
## 12 Northern Colo. 1955 24 2 0.923 1 2 (5th)
## 13 South Carolina 1975 47 4 0.922 4 2 (2nd)
## 14 Harvard 1973 35 3 0.921 0 2 (7th)
## 15 Texas 1975 52 5 0.912 4 1 (1st)
## 16 Texas 1973 48 5 0.906 2 2 (3rd)
## 17 Arizona St. 1973 56 6 0.903 3 2 (2nd)
## 18 Texas 1979 53 6 0.898 2 2 (4th)
## 19 Texas 1974 52 6 0.897 2 2 (4th)
## 20 Arizona St. 1964 43 5 0.896 1 2 (5th)
# Strip the surrounding parentheses from the finish, e.g. "(2nd)" -> "2nd".
# A single character class handles both "(" and ")" in one pass; only the
# parentheses are removed, so any other punctuation in the value is kept.
ws2$Finish <- gsub("[()]", "", ws2$Finish)
ws2
## Team Year Wins Loses Pct. WS Wins WS Loses Finish
## 1 Penn St. 1957 19 0 1.000 3 2 2nd
## 2 Ithaca 1962 17 0 1.000 1 2 5th
## 3 Oklahoma St. 1955 24 1 0.960 3 2 3rd
## 4 Oklahoma St. 1961 24 1 0.960 3 2 2nd
## 5 UConn 1959 20 1 0.952 0 2 7th
## 6 Tennessee 1951 16 1 0.941 4 2 2nd
## 7 Arizona St. 1972 60 4 0.938 4 2 2nd
## 8 Utah 1951 15 1 0.938 2 2 3rd
## 9 Wake Forest 1949 29 2 0.935 2 2 2nd
## 10 Texas 1982 57 4 0.934 2 2 3rd
## 11 Tulsa 1969 36 3 0.923 3 2 2nd
## 12 Northern Colo. 1955 24 2 0.923 1 2 5th
## 13 South Carolina 1975 47 4 0.922 4 2 2nd
## 14 Harvard 1973 35 3 0.921 0 2 7th
## 15 Texas 1975 52 5 0.912 4 1 1st
## 16 Texas 1973 48 5 0.906 2 2 3rd
## 17 Arizona St. 1973 56 6 0.903 3 2 2nd
## 18 Texas 1979 53 6 0.898 2 2 4th
## 19 Texas 1974 52 6 0.897 2 2 4th
## 20 Arizona St. 1964 43 5 0.896 1 2 5th
Now, I’ll do a quick analysis of the data with some visualizations.
# One point per team-season: year on the x axis, team on the y axis, with
# the winning percentage mapped to both point size and point colour.
ggplot(data = ws2, mapping = aes(x = Year, y = Team)) +
  geom_point(mapping = aes(size = Pct., color = Pct.)) +
  coord_cartesian() +
  scale_color_gradient() +
  theme_minimal()
By the look of the graph, both Penn St. and Ithaca had perfect seasons before going into the CWS.