library(Hmisc)
## Loading required package: grid
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
##
## Attaching package: 'Hmisc'
##
## The following objects are masked from 'package:base':
##
## format.pval, round.POSIXt, trunc.POSIXt, units
# Load the built-in mtcars data frame and take a first look at it.
data("mtcars")
# First ten rows.
head(mtcars, n = 10)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
## Valiant 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1
## Duster 360 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4
## Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2
## Merc 230 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2
## Merc 280 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4
# Last five rows.
tail(mtcars, n = 5)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Lotus Europa 30.4 4 95.1 113 3.77 1.513 16.9 1 1 5 2
## Ford Pantera L 15.8 8 351.0 264 4.22 3.170 14.5 0 1 5 4
## Ferrari Dino 19.7 6 145.0 175 3.62 2.770 15.5 0 1 5 6
## Maserati Bora 15.0 8 301.0 335 3.54 3.570 14.6 0 1 5 8
## Volvo 142E 21.4 4 121.0 109 4.11 2.780 18.6 1 1 4 2
# Column names.
colnames(mtcars)
## [1] "mpg" "cyl" "disp" "hp" "drat" "wt" "qsec" "vs" "am" "gear"
## [11] "carb"
# Compact structure: one line per column with its type and leading values.
str(mtcars)
## 'data.frame': 32 obs. of 11 variables:
## $ mpg : num 21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
## $ cyl : num 6 6 4 6 8 6 8 4 4 6 ...
## $ disp: num 160 160 108 258 360 ...
## $ hp : num 110 110 93 110 175 105 245 62 95 123 ...
## $ drat: num 3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
## $ wt : num 2.62 2.88 2.32 3.21 3.44 ...
## $ qsec: num 16.5 17 18.6 19.4 17 ...
## $ vs : num 0 0 1 1 0 1 0 1 1 1 ...
## $ am : num 1 1 1 0 0 0 0 0 0 0 ...
## $ gear: num 4 4 4 3 3 3 3 4 4 4 ...
## $ carb: num 4 4 1 1 2 1 4 2 2 4 ...
# Data-frame indexing uses [row, column]; an empty slot means "all".
# First row, every column.
mtcars[1, ]
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21 6 160 110 3.9 2.62 16.46 0 1 4 4
# Every row of the second column (cyl), returned as a plain vector.
mtcars[, 2]
## [1] 6 6 4 6 8 6 8 4 4 6 6 8 8 8 8 8 8 4 4 4 4 8 8 8 8 4 4 4 8 6 8 4
# A single cell: row 2, column 3.
mtcars[2, 3]
## [1] 160
# The same cyl column, this time selected by name.
mtcars[["cyl"]]
## [1] 6 6 4 6 8 6 8 4 4 6 6 8 8 8 8 8 8 4 4 4 4 8 8 8 8 4 4 4 8 6 8 4
# mtcars stores the car names as its row names ...
rownames(mtcars)
## [1] "Mazda RX4" "Mazda RX4 Wag" "Datsun 710"
## [4] "Hornet 4 Drive" "Hornet Sportabout" "Valiant"
## [7] "Duster 360" "Merc 240D" "Merc 230"
## [10] "Merc 280" "Merc 280C" "Merc 450SE"
## [13] "Merc 450SL" "Merc 450SLC" "Cadillac Fleetwood"
## [16] "Lincoln Continental" "Chrysler Imperial" "Fiat 128"
## [19] "Honda Civic" "Toyota Corolla" "Toyota Corona"
## [22] "Dodge Challenger" "AMC Javelin" "Camaro Z28"
## [25] "Pontiac Firebird" "Fiat X1-9" "Porsche 914-2"
## [28] "Lotus Europa" "Ford Pantera L" "Ferrari Dino"
## [31] "Maserati Bora" "Volvo 142E"
# ... whereas the row names of iris are just the row numbers as strings.
rownames(iris)
## [1] "1" "2" "3" "4" "5" "6" "7" "8" "9" "10" "11"
## [12] "12" "13" "14" "15" "16" "17" "18" "19" "20" "21" "22"
## [23] "23" "24" "25" "26" "27" "28" "29" "30" "31" "32" "33"
## [34] "34" "35" "36" "37" "38" "39" "40" "41" "42" "43" "44"
## [45] "45" "46" "47" "48" "49" "50" "51" "52" "53" "54" "55"
## [56] "56" "57" "58" "59" "60" "61" "62" "63" "64" "65" "66"
## [67] "67" "68" "69" "70" "71" "72" "73" "74" "75" "76" "77"
## [78] "78" "79" "80" "81" "82" "83" "84" "85" "86" "87" "88"
## [89] "89" "90" "91" "92" "93" "94" "95" "96" "97" "98" "99"
## [100] "100" "101" "102" "103" "104" "105" "106" "107" "108" "109" "110"
## [111] "111" "112" "113" "114" "115" "116" "117" "118" "119" "120" "121"
## [122] "122" "123" "124" "125" "126" "127" "128" "129" "130" "131" "132"
## [133] "133" "134" "135" "136" "137" "138" "139" "140" "141" "142" "143"
## [144] "144" "145" "146" "147" "148" "149" "150"
# Copy the row names (car names) into a regular character column, then
# print the per-column summary of the widened data frame.
mtcars[["brand"]] <- rownames(mtcars)
summary(object = mtcars)
## mpg cyl disp hp
## Min. :10.40 Min. :4.000 Min. : 71.1 Min. : 52.0
## 1st Qu.:15.43 1st Qu.:4.000 1st Qu.:120.8 1st Qu.: 96.5
## Median :19.20 Median :6.000 Median :196.3 Median :123.0
## Mean :20.09 Mean :6.188 Mean :230.7 Mean :146.7
## 3rd Qu.:22.80 3rd Qu.:8.000 3rd Qu.:326.0 3rd Qu.:180.0
## Max. :33.90 Max. :8.000 Max. :472.0 Max. :335.0
## drat wt qsec vs
## Min. :2.760 Min. :1.513 Min. :14.50 Min. :0.0000
## 1st Qu.:3.080 1st Qu.:2.581 1st Qu.:16.89 1st Qu.:0.0000
## Median :3.695 Median :3.325 Median :17.71 Median :0.0000
## Mean :3.597 Mean :3.217 Mean :17.85 Mean :0.4375
## 3rd Qu.:3.920 3rd Qu.:3.610 3rd Qu.:18.90 3rd Qu.:1.0000
## Max. :4.930 Max. :5.424 Max. :22.90 Max. :1.0000
## am gear carb brand
## Min. :0.0000 Min. :3.000 Min. :1.000 Length:32
## 1st Qu.:0.0000 1st Qu.:3.000 1st Qu.:2.000 Class :character
## Median :0.0000 Median :4.000 Median :2.000 Mode :character
## Mean :0.4062 Mean :3.688 Mean :2.812
## 3rd Qu.:1.0000 3rd Qu.:4.000 3rd Qu.:4.000
## Max. :1.0000 Max. :5.000 Max. :8.000
# Per-variable report from Hmisc for the same data frame.
Hmisc::describe(mtcars)
## mtcars
##
## 12 Variables 32 Observations
## ---------------------------------------------------------------------------
## mpg
## n missing unique Info Mean .05 .10 .25 .50
## 32 0 25 1 20.09 12.00 14.34 15.43 19.20
## .75 .90 .95
## 22.80 30.09 31.30
##
## lowest : 10.4 13.3 14.3 14.7 15.0, highest: 26.0 27.3 30.4 32.4 33.9
## ---------------------------------------------------------------------------
## cyl
## n missing unique Info Mean
## 32 0 3 0.87 6.188
##
## 4 (11, 34%), 6 (7, 22%), 8 (14, 44%)
## ---------------------------------------------------------------------------
## disp
## n missing unique Info Mean .05 .10 .25 .50
## 32 0 27 1 230.7 77.35 80.61 120.83 196.30
## .75 .90 .95
## 326.00 396.00 449.00
##
## lowest : 71.1 75.7 78.7 79.0 95.1
## highest: 360.0 400.0 440.0 460.0 472.0
## ---------------------------------------------------------------------------
## hp
## n missing unique Info Mean .05 .10 .25 .50
## 32 0 22 1 146.7 63.65 66.00 96.50 123.00
## .75 .90 .95
## 180.00 243.50 253.55
##
## lowest : 52 62 65 66 91, highest: 215 230 245 264 335
## ---------------------------------------------------------------------------
## drat
## n missing unique Info Mean .05 .10 .25 .50
## 32 0 22 1 3.597 2.853 3.007 3.080 3.695
## .75 .90 .95
## 3.920 4.209 4.314
##
## lowest : 2.76 2.93 3.00 3.07 3.08, highest: 4.08 4.11 4.22 4.43 4.93
## ---------------------------------------------------------------------------
## wt
## n missing unique Info Mean .05 .10 .25 .50
## 32 0 29 1 3.217 1.736 1.956 2.581 3.325
## .75 .90 .95
## 3.610 4.048 5.293
##
## lowest : 1.513 1.615 1.835 1.935 2.140
## highest: 3.845 4.070 5.250 5.345 5.424
## ---------------------------------------------------------------------------
## qsec
## n missing unique Info Mean .05 .10 .25 .50
## 32 0 30 1 17.85 15.05 15.53 16.89 17.71
## .75 .90 .95
## 18.90 19.99 20.10
##
## lowest : 14.50 14.60 15.41 15.50 15.84
## highest: 19.90 20.00 20.01 20.22 22.90
## ---------------------------------------------------------------------------
## vs
## n missing unique Info Sum Mean
## 32 0 2 0.74 14 0.4375
## ---------------------------------------------------------------------------
## am
## n missing unique Info Sum Mean
## 32 0 2 0.72 13 0.4062
## ---------------------------------------------------------------------------
## gear
## n missing unique Info Mean
## 32 0 3 0.84 3.688
##
## 3 (15, 47%), 4 (12, 38%), 5 (5, 16%)
## ---------------------------------------------------------------------------
## carb
## n missing unique Info Mean
## 32 0 6 0.93 2.812
##
## 1 2 3 4 6 8
## Frequency 7 10 3 10 1 1
## % 22 31 9 31 3 3
## ---------------------------------------------------------------------------
## brand
## n missing unique
## 32 0 32
##
## lowest : AMC Javelin Cadillac Fleetwood Camaro Z28 Chrysler Imperial Datsun 710
## highest: Porsche 914-2 Toyota Corolla Toyota Corona Valiant Volvo 142E
## ---------------------------------------------------------------------------
# Q: What does describe() do compared to summary()?
#editDataset(mtcars)
# Work on a copy so that mtcars itself is left as it is.
mtcars2 <- mtcars
# Inject missing values: columns 1-6 of row 3, then column 2 (cyl) for rows 3-9.
mtcars2[3, 1:6] <- NA
mtcars2[3:9, 2] <- NA
# na.rm = TRUE makes mean() skip the missing values.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
mean(mtcars2$cyl, na.rm = TRUE)
## [1] 6.32
# is.na() is TRUE exactly at the positions that hold a missing value.
is.na(mtcars2$cyl)
## [1] FALSE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE FALSE
## [12] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [23] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
# Check the original data frame: the NAs were written to mtcars2 only, so
# mtcars still shows complete columns (plus the added brand column).
utils::str(mtcars)
## 'data.frame': 32 obs. of 12 variables:
## $ mpg : num 21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
## $ cyl : num 6 6 4 6 8 6 8 4 4 6 ...
## $ disp : num 160 160 108 258 360 ...
## $ hp : num 110 110 93 110 175 105 245 62 95 123 ...
## $ drat : num 3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
## $ wt : num 2.62 2.88 2.32 3.21 3.44 ...
## $ qsec : num 16.5 17 18.6 19.4 17 ...
## $ vs : num 0 0 1 1 0 1 0 1 1 1 ...
## $ am : num 1 1 1 0 0 0 0 0 0 0 ...
## $ gear : num 4 4 4 3 3 3 3 4 4 4 ...
## $ carb : num 4 4 1 1 2 1 4 2 2 4 ...
## $ brand: chr "Mazda RX4" "Mazda RX4 Wag" "Datsun 710" "Hornet 4 Drive" ...
mtcars[["cyl"]]
## [1] 6 6 4 6 8 6 8 4 4 6 6 8 8 8 8 8 8 4 4 4 4 8 8 8 8 4 4 4 8 6 8 4
# Task: replace every missing value in the mtcars copy (mtcars2) with the column mean or median and store the result in a new dataset. Do it using code only.
# Hint: you will need the ifelse() and is.na() functions and the na.rm argument. Do you need more hints?
# Option 1 - listwise deletion: drop every row that contains an NA.
# The missing values live in mtcars2 (mtcars is complete), so mtcars2 is the
# data frame na.omit() has to be given.
mtcars3 <- na.omit(mtcars2)
# Option 2 - mean imputation: fill the gaps in cyl with the mean of the
# observed values. Both the NA test and the kept values must be read from
# mtcars2; reading them from mtcars, which has no NAs, imputes nothing.
mtcars2$cyl21 <- ifelse(
  is.na(mtcars2$cyl),
  mean(mtcars2$cyl, na.rm = TRUE),
  mtcars2$cyl
)
#?ifelse
head(mtcars2, 9)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 NA NA NA NA NA NA 18.61 1 1 4 1
## Hornet 4 Drive 21.4 NA 258.0 110 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 NA 360.0 175 3.15 3.440 17.02 0 0 3 2
## Valiant 18.1 NA 225.0 105 2.76 3.460 20.22 1 0 3 1
## Duster 360 14.3 NA 360.0 245 3.21 3.570 15.84 0 0 3 4
## Merc 240D 24.4 NA 146.7 62 3.69 3.190 20.00 1 0 4 2
## Merc 230 22.8 NA 140.8 95 3.92 3.150 22.90 1 0 4 2
## brand cyl21
## Mazda RX4 Mazda RX4 6.00
## Mazda RX4 Wag Mazda RX4 Wag 6.00
## Datsun 710 Datsun 710 6.32
## Hornet 4 Drive Hornet 4 Drive 6.32
## Hornet Sportabout Hornet Sportabout 6.32
## Valiant Valiant 6.32
## Duster 360 Duster 360 6.32
## Merc 240D Merc 240D 6.32
## Merc 230 Merc 230 6.32
str(mtcars2)
## 'data.frame': 32 obs. of 13 variables:
## $ mpg : num 21 21 NA 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
## $ cyl : num 6 6 NA NA NA NA NA NA NA 6 ...
## $ disp : num 160 160 NA 258 360 ...
## $ hp : num 110 110 NA 110 175 105 245 62 95 123 ...
## $ drat : num 3.9 3.9 NA 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
## $ wt : num 2.62 2.88 NA 3.21 3.44 ...
## $ qsec : num 16.5 17 18.6 19.4 17 ...
## $ vs : num 0 0 1 1 0 1 0 1 1 1 ...
## $ am : num 1 1 1 0 0 0 0 0 0 0 ...
## $ gear : num 4 4 4 3 3 3 3 4 4 4 ...
## $ carb : num 4 4 1 1 2 1 4 2 2 4 ...
## $ brand: chr "Mazda RX4" "Mazda RX4 Wag" "Datsun 710" "Hornet 4 Drive" ...
## $ cyl21: num 6 6 6.32 6.32 6.32 6.32 6.32 6.32 6.32 6 ...
# Option 3 - median imputation: same pattern, with the median of the
# observed cyl values as the fill-in.
mtcars2$cyl2 <- ifelse(
  is.na(mtcars2$cyl),
  median(mtcars2$cyl, na.rm = TRUE),
  mtcars2$cyl
)
# cyl2 was added to mtcars2, so that is the data frame to tabulate;
# mtcars has no such column and yields an empty table.
table(mtcars2$cyl2)
##
## 4 6 8
## 8 12 12
# Correlations need complete rows: drop the rows that still contain an NA and
# correlate the 11 original numeric columns.
mtcars3 <- na.omit(mtcars2)
cor(mtcars3[1:11])
## mpg cyl disp hp drat wt
## mpg 1.0000000 -0.8758400 -0.8556910 -0.77961838 0.7127164 -0.8819429
## cyl -0.8758400 1.0000000 0.8985035 0.81853826 -0.7665601 0.8324075
## disp -0.8556910 0.8985035 1.0000000 0.78859201 -0.7365131 0.9164049
## hp -0.7796184 0.8185383 0.7885920 1.00000000 -0.5203642 0.6966594
## drat 0.7127164 -0.7665601 -0.7365131 -0.52036417 1.0000000 -0.7533969
## wt -0.8819429 0.8324075 0.9164049 0.69665941 -0.7533969 1.0000000
## qsec 0.4441360 -0.5350648 -0.4116137 -0.71443266 0.1481272 -0.2072452
## vs 0.6958696 -0.7992424 -0.7185176 -0.67406989 0.5892788 -0.6070036
## am 0.6679774 -0.6295948 -0.6355251 -0.31649083 0.7291590 -0.7232991
## gear 0.4679080 -0.4788756 -0.5271718 -0.08737484 0.6541172 -0.5972689
## carb -0.5724860 0.5232139 0.3982365 0.74859820 -0.2007564 0.4467421
## qsec vs am gear carb
## mpg 0.4441360 0.6958696 0.667977355 0.46790795 -0.572486018
## cyl -0.5350648 -0.7992424 -0.629594787 -0.47887557 0.523213913
## disp -0.4116137 -0.7185176 -0.635525058 -0.52717181 0.398236524
## hp -0.7144327 -0.6740699 -0.316490831 -0.08737484 0.748598205
## drat 0.1481272 0.5892788 0.729159008 0.65411720 -0.200756408
## wt -0.2072452 -0.6070036 -0.723299122 -0.59726894 0.446742146
## qsec 1.0000000 0.7005019 -0.152673790 -0.35401014 -0.685801376
## vs 0.7005019 1.0000000 0.280224269 0.23586409 -0.499125907
## am -0.1526738 0.2802243 1.000000000 0.82671008 0.001973435
## gear -0.3540101 0.2358641 0.826710084 1.00000000 0.273886265
## carb -0.6858014 -0.4991259 0.001973435 0.27388627 1.000000000
# Overall mean of mpg, skipping the one missing value.
mean(mtcars2$mpg, na.rm = TRUE)
## [1] 20.00323
# Mean mpg per cyl group (Hmisc::summarize). The argument expressions are
# kept as written because they become the column labels of the result.
summarize(mtcars2$mpg, mtcars2$cyl, mean)
## mtcars2$cyl mtcars2$mpg
## 1 4 27.91250
## 2 6 19.74000
## 3 8 14.86667
## 4 NA NA
head(mtcars2)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 NA NA NA NA NA NA 18.61 1 1 4 1
## Hornet 4 Drive 21.4 NA 258 110 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 NA 360 175 3.15 3.440 17.02 0 0 3 2
## Valiant 18.1 NA 225 105 2.76 3.460 20.22 1 0 3 1
## brand cyl21 cyl2
## Mazda RX4 Mazda RX4 6.00 6
## Mazda RX4 Wag Mazda RX4 Wag 6.00 6
## Datsun 710 Datsun 710 6.32 6
## Hornet 4 Drive Hornet 4 Drive 6.32 6
## Hornet Sportabout Hornet Sportabout 6.32 6
## Valiant Valiant 6.32 6
# First three columns side by side: the untouched original ...
head(mtcars[, 1:3], n = 9)
## mpg cyl disp
## Mazda RX4 21.0 6 160.0
## Mazda RX4 Wag 21.0 6 160.0
## Datsun 710 22.8 4 108.0
## Hornet 4 Drive 21.4 6 258.0
## Hornet Sportabout 18.7 8 360.0
## Valiant 18.1 6 225.0
## Duster 360 14.3 8 360.0
## Merc 240D 24.4 4 146.7
## Merc 230 22.8 4 140.8
# ... and the copy that carries the injected NAs.
head(mtcars2[, 1:3], n = 9)
## mpg cyl disp
## Mazda RX4 21.0 6 160.0
## Mazda RX4 Wag 21.0 6 160.0
## Datsun 710 NA NA NA
## Hornet 4 Drive 21.4 NA 258.0
## Hornet Sportabout 18.7 NA 360.0
## Valiant 18.1 NA 225.0
## Duster 360 14.3 NA 360.0
## Merc 240D 24.4 NA 146.7
## Merc 230 22.8 NA 140.8
# Keep complete rows only, then average disp within each cyl group.
# The first two argument expressions are left exactly as written because
# Hmisc::summarize() turns them into the column labels of its result.
mtcars3 <- na.omit(mtcars2)
summarize(mtcars3$disp, by = mtcars3$cyl, FUN = mean)
## mtcars3$cyl mtcars3$disp
## 1 4 95.125
## 2 6 160.040
## 3 8 351.950
# Second practice dataset: the Boston data frame from the MASS package.
data("Boston", package = "MASS")
colnames(Boston)
## [1] "crim" "zn" "indus" "chas" "nox" "rm" "age"
## [8] "dis" "rad" "tax" "ptratio" "black" "lstat" "medv"
# Blank out medv (the 14th column) for rows 2-40, then summarise it;
# summary() reports the number of NAs alongside the quantiles.
Boston[2:40, "medv"] <- NA
summary(Boston[["medv"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 5.00 17.20 21.50 22.75 25.15 50.00 39
#write.csv(Boston,file="Boston.csv")
# Exercise: impute the missing medv values in Boston and show the code you used.
#getwd()