Load data as data frame, inspect
# Download the bridges data set from the UCI Machine Learning Repository.
# The raw file carries no header line (column names are assigned below),
# so header = FALSE is required: with read.csv's default header = TRUE the
# first record would be consumed as column names and silently lost.
bridges <- read.csv(
  url("https://archive.ics.uci.edu/ml/machine-learning-databases/bridges/bridges.data.version1"),
  header = FALSE
)
# read.csv() already returns a data frame, so no further conversion is needed.
names(bridges) <- c(
  "ID", "River", "State", "Year_Built", "Purpose", "Length", "Lanes",
  "Clear", "Deck_Type", "Material", "Span", "Rel", "Type"
)
# Per-column overview; unknown values appear as the literal "?" in this file.
summary(bridges)
## ID River State Year_Built Purpose
## E10 : 1 A:49 28 : 5 Min. :1819 AQUEDUCT: 4
## E100 : 1 M:40 39 : 5 1st Qu.:1884 HIGHWAY :70
## E101 : 1 O:15 25 : 4 Median :1903 RR :32
## E102 : 1 Y: 3 27 : 4 Mean :1906 WALK : 1
## E103 : 1 29 : 4 3rd Qu.:1928
## E105 : 1 1 : 3 Max. :1986
## (Other):101 (Other):82
## Length Lanes Clear Deck_Type Material Span Rel
## ? :26 ?:16 ?: 2 ? : 6 ? : 2 ? :16 ? : 5
## 1000 :10 1: 4 G:80 DECK :15 IRON :11 LONG :30 F :58
## 1200 : 5 2:60 N:25 THROUGH:86 STEEL:79 MEDIUM:53 S :29
## 1500 : 2 4:23 WOOD :15 SHORT : 8 S-F:15
## 2000 : 2 6: 4
## 2300 : 2
## (Other):60
## Type
## SIMPLE-T:44
## WOOD :15
## ARCH :13
## CANTILEV:11
## SUSPEN :11
## CONT-T :10
## (Other) : 3
# Preview the first six records to check the column assignment.
head(bridges, n = 6L)
## ID River State Year_Built Purpose Length Lanes Clear Deck_Type Material
## 1 E2 A 25 1819 HIGHWAY 1037 2 N THROUGH WOOD
## 2 E3 A 39 1829 AQUEDUCT ? 1 N THROUGH WOOD
## 3 E5 A 29 1837 HIGHWAY 1000 2 N THROUGH WOOD
## 4 E6 M 23 1838 HIGHWAY ? 2 N THROUGH WOOD
## 5 E7 A 27 1840 HIGHWAY 990 2 N THROUGH WOOD
## 6 E8 A 28 1844 AQUEDUCT 1000 1 N THROUGH IRON
## Span Rel Type
## 1 SHORT S WOOD
## 2 ? S WOOD
## 3 SHORT S WOOD
## 4 ? S WOOD
## 5 MEDIUM S WOOD
## 6 SHORT S SUSPEN
Subset the data using the subset() function to select certain columns
# Columns shared by every subset plotted below.
subBridges <- subset(
  bridges,
  select = c(Lanes, State, Purpose, Material, Year_Built, Length)
)
# Bridges longer than 1000, matching the "Longer than 1000'" title used when
# this subset is plotted (the previous cut-off of 100 did not match it).
# Length is stored as text with "?" marking unknown values: those become NA
# on conversion (hence the coercion warning) and subset() drops rows whose
# condition evaluates to NA.
subLongBridges <- subset(
  subBridges,
  as.numeric(as.character(Length)) > 1000
)
## Warning in eval(e, x, parent.frame()): NAs introduced by coercion
# Keep only the rows whose State code is one of the six listed values.
# NOTE(review): these codes are presumed to denote the New England states
# (per the plot title used later) - confirm against the data set's docs.
newEnglandCodes <- c("6", "18", "20", "28", "44", "38")
subNEBridges <- subset(subBridges, State %in% newEnglandCodes)
By storing a basic aesthetic plot as an object, several variations can be plotted quickly with + [different geoms]
# Base plot: year built on x, bridge length on y, with no geom attached yet.
# Both columns go through as.character() before as.numeric() so that factor
# or text columns are converted by their printed value.
lengthPlot <- ggplot(
  subLongBridges,
  aes(
    x = as.numeric(as.character(Year_Built)),
    y = as.numeric(as.character(Length))
  )
) +
  labs(
    x = "Year Built",
    y = "Bridge Length",
    caption = "Lab 1 Data 607, Alice Friedman"
  )

# Each line adds a jitter layer to the stored base plot and prints the result:
# first uncoloured and semi-transparent, then coloured by one column at a time.
lengthPlot + geom_jitter(alpha = 0.5)
lengthPlot + geom_jitter(aes(colour = Material))
lengthPlot + geom_jitter(aes(colour = Purpose))
lengthPlot + geom_jitter(aes(colour = Lanes))
By creating a function, the same plot type can be used on different tables.
# Jittered scatter plot of bridge length against year built, coloured by
# material.
#
# Arguments:
#   yourTable  - table with columns Year_Built, Length and Material.
#                Year_Built and Length are converted with
#                as.numeric(as.character(...)), so values that are not
#                numbers (e.g. "?") become NA with a coercion warning.
#   tableTitle - title shown above the plot.
#
# Returns the ggplot object; at top level it is printed automatically.
plot_bridges <- function(yourTable, tableTitle){
  # Fail early with a clear message rather than an obscure "object not
  # found" error when the plot is eventually printed.
  requiredCols <- c("Year_Built", "Length", "Material")
  missingCols <- setdiff(requiredCols, names(yourTable))
  if (length(missingCols) > 0) {
    stop(
      "yourTable is missing required column(s): ",
      paste(missingCols, collapse = ", "),
      call. = FALSE
    )
  }

  p <- ggplot(
    data = yourTable,
    aes(
      x = as.numeric(as.character(Year_Built)),
      y = as.numeric(as.character(Length))
    )
  ) +
    labs(
      x = "Year Built",
      y = "Bridge Length",
      caption = "Lab 1 Data 607, Alice Friedman",
      title = tableTitle
    )
  p + geom_jitter(alpha = 0.5, aes(col = Material))
}
# Reuse the plotting function on the State-filtered subset.
plot_bridges(
  yourTable = subNEBridges,
  tableTitle = "Subset of Data for New England Bridges"
)
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning: Removed 2 rows containing missing values (geom_point).
# Same plot type, applied to the length-filtered subset.
plot_bridges(
  yourTable = subLongBridges,
  tableTitle = "Subset of Bridges Longer than 1000'"
)
A more useful version of this would take the column names and labels as arguments, as well!