Get web data

library(RCurl)
# Download the raw bridges data set; readLines() returns a character vector
# with one element per line of the remote file.
v <- readLines(
  "http://archive.ics.uci.edu/ml/machine-learning-databases/bridges/bridges.data.version2"
)
summary(v) # length / class / mode of the downloaded character vector
##    Length     Class      Mode 
##       108 character character

Convert webpage to dataframe

# Split every comma-separated line into its fields, then stack the fields
# row-wise into a data frame (one row per line, columns auto-named X1, X2, ...).
v <- strsplit(v, ",") # list holding one character vector per input line

# Every line must have the same number of fields; otherwise matrix() would
# misalign the columns instead of failing.
stopifnot(length(unique(lengths(v))) == 1L)

df <- data.frame(
  # Take the row count from the data rather than hard-coding it.
  matrix(unlist(v), nrow = length(v), byrow = TRUE),
  # Later steps rely on factor columns (levels()); R >= 4.0 no longer
  # creates them by default, so request them explicitly.
  stringsAsFactors = TRUE
)
tail(df)
##       X1 X2 X3     X4      X5    X6 X7 X8      X9   X10    X11 X12    X13
## 103  E85  M  9 MODERN HIGHWAY  LONG  4  G    DECK STEEL   LONG   F CONT-T
## 104  E84  A 24 MODERN HIGHWAY SHORT  6  G THROUGH STEEL MEDIUM   F   ARCH
## 105  E91  O 44 MODERN HIGHWAY  LONG  6  G THROUGH STEEL   LONG   F   ARCH
## 106  E90  M  7 MODERN HIGHWAY SHORT  6  G THROUGH STEEL   LONG   F   ARCH
## 107 E100  O 43 MODERN HIGHWAY     ?  ?  G       ?     ?      ?   F      ?
## 108 E109  A 28 MODERN HIGHWAY     ?  ?  G       ?     ?      ?   F      ?

Subset columns of interest and assign descriptive names

# Keep columns 1-3 and 7-11 of the parsed data and give them readable names.
br <- df[, c(1:3, 7:11)]
colnames(br) <- c(
  "id", "river", "location",
  "lanes", "clear_g", "t_d", "material", "span"
)
head(br)
##   id river location lanes clear_g     t_d material   span
## 1 E1     M        3     2       N THROUGH     WOOD  SHORT
## 2 E2     A       25     2       N THROUGH     WOOD  SHORT
## 3 E3     A       39     1       N THROUGH     WOOD      ?
## 4 E5     A       29     2       N THROUGH     WOOD  SHORT
## 5 E6     M       23     2       N THROUGH     WOOD      ?
## 6 E7     A       27     2       N THROUGH     WOOD MEDIUM

Some basic data cleansing

# Basic cleaning of `br`: mark missing values and make `lanes` numeric.
br[br == "?"] <- NA # "?" is the raw data's marker for a missing value
# Rename the "?" factor level of `span`; the affected cells are already NA
# from the line above, so this only relabels the (now empty) level.
levels(br$span)[levels(br$span) == "?"] <- "missing"
str(br$span)
##  Factor w/ 4 levels "missing","LONG",..: 4 4 NA 4 NA 3 4 4 NA 3 ...
# Convert through character first: as.numeric() applied directly to a factor
# returns the internal level codes, not the numbers the labels spell out
# (a 6-lane bridge would be stored as 5).
br$lanes <- as.numeric(as.character(br$lanes))
# NOTE(review): recorded output of `lanes` produced before this fix shows
# level codes (2-5) instead of lane counts; re-run to refresh it.
summary(br$lanes)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   2.000   3.000   3.000   3.293   4.000   5.000      16

Get rows of interest: Bridges with long spans and many lanes

# Bridges with a long span and at least four lanes. subset() evaluates the
# condition inside `br` and drops rows where it is NA.
bigbr <- subset(
  br,
  span == "LONG" & lanes >= 4
)
bigbr
##      id river location lanes clear_g     t_d material span
## 67  E60     A       24     4       G THROUGH    STEEL LONG
## 78  E67     M        1     4       G THROUGH    STEEL LONG
## 83  E78     O       40     4       G THROUGH    STEEL LONG
## 84  E77     O       42     4       N THROUGH    STEEL LONG
## 85  E76     M        6     4       G THROUGH    STEEL LONG
## 86  E93     M       11     4       N    DECK    STEEL LONG
## 96  E81     M       14     4       G    DECK    STEEL LONG
## 97  E80     M       19     4       G THROUGH    STEEL LONG
## 98  E88     A       37     4       N    DECK    STEEL LONG
## 101 E83     M        1     5       G THROUGH    STEEL LONG
## 103 E85     M        9     4       G    DECK    STEEL LONG
## 105 E91     O       44     5       G THROUGH    STEEL LONG
## 106 E90     M        7     5       G THROUGH    STEEL LONG