Manipulating Data in R
R Models
Spring 2020
Manipulating Data in R
R Models
R’s read.table() functions can handle a lot of things
http://cs.ucf.edu/~wiegand/idc6700/datasets/L8-knapsack-data.csv
x = read.table('http://cs.ucf.edu/~wiegand/idc6700/datasets/L8-knapsack-data.csv',
sep=',', # Use a comma as a field separator
header=FALSE, # Don't use the first line as the header
skip=10, # But skip the first 10 lines
col.names=c('ProblemInstance','Trial','BestSolution','ExtraComments'),
colClasses =c('character', 'integer', 'numeric', 'character'),
fill=TRUE, # Fill out remaining missing cols
blank.lines.skip=TRUE, # Skip blank lines
comment.char=";")
head(x)
## ProblemInstance Trial BestSolution ExtraComments ## 1 P1 0 13.55185 ## 2 P1 1 13.79003 ## 3 P1 3 14.68695 ## 4 P1 4 13.52434 annoying extra field ## 5 P1 5 13.55290 ## 6 P1 6 14.03507
colnames(ChickWeight)
## [1] "weight" "Time" "Chick" "Diet"
subset(ChickWeight, weight > 330, select=c(weight,Time))
## weight Time ## 232 331 21 ## 388 341 21 ## 398 332 18 ## 399 361 20 ## 400 373 21
head(transform(ChickWeight, weight=-weight))
## weight Time Chick Diet ## 1 -42 0 1 1 ## 2 -51 2 1 1 ## 3 -59 4 1 1 ## 4 -64 6 1 1 ## 5 -76 8 1 1 ## 6 -93 10 1 1
head(reorder(ChickWeight$Time, ChickWeight$weight))
## [1] 0 2 4 6 8 10 ## Levels: 0 2 4 6 8 10 12 14 16 18 20 21
head(transform(ChickWeight, reorder(Time, weight)))
## weight Time Chick Diet ## 1 42 0 1 1 ## 2 51 2 1 1 ## 3 59 4 1 1 ## 4 64 6 1 1 ## 5 76 8 1 1 ## 6 93 10 1 1
library(dplyr,quietly=TRUE, warn.conflicts=FALSE) head(arrange(ChickWeight, desc(Diet,Chick,Time,weight)))
## weight Time Chick Diet ## 1 42 0 41 4 ## 2 51 2 41 4 ## 3 66 4 41 4 ## 4 85 6 41 4 ## 5 103 8 41 4 ## 6 124 10 41 4
ChickWeight[ ChickWeight$Chick==1 & ChickWeight$Time > 10, ]
## weight Time Chick Diet ## 7 106 12 1 1 ## 8 125 14 1 1 ## 9 149 16 1 1 ## 10 171 18 1 1 ## 11 199 20 1 1 ## 12 205 21 1 1
ChickWeight[ c(3,5,7), 1:3 ]
## weight Time Chick ## 3 59 4 1 ## 5 76 8 1 ## 7 106 12 1
subset(aggregate(state.x77,list(Region = state.region), mean),
select=c(Region, Population, Income, Illiteracy))
## Region Population Income Illiteracy ## 1 Northeast 5495.111 4570.222 1.000000 ## 2 South 4208.125 4011.938 1.737500 ## 3 North Central 4803.000 4611.083 0.700000 ## 4 West 2915.308 4702.615 1.023077
A = data.frame(Var1 = c(1,2,3), Var2=factor(c("a","b","c")))
B = data.frame(Var3=c(-3, 2.7, 52))
merge(A,B)
## Var1 Var2 Var3 ## 1 1 a -3.0 ## 2 2 b -3.0 ## 3 3 c -3.0 ## 4 1 a 2.7 ## 5 2 b 2.7 ## 6 3 c 2.7 ## 7 1 a 52.0 ## 8 2 b 52.0 ## 9 3 c 52.0
A = data.frame(Var1 = c(1,2,3), Var2=factor(c("a","b","c")))
B = data.frame(Var2=factor(c("a","a","b")), Var3=c(-3, 2.7, 52))
merge(A,B)
## Var2 Var1 Var3 ## 1 a 1 -3.0 ## 2 a 1 2.7 ## 3 b 2 52.0
A = data.frame(Var1=c(1,2,3), Var2=factor(c("a","b","c")))
B = data.frame(Var1=c(9,10), Var2=factor(c("b","c")))
rbind(A,B)
## Var1 Var2 ## 1 1 a ## 2 2 b ## 3 3 c ## 4 9 b ## 5 10 c
But it isn’t “correct” for most data analysis purposes
| Survey-Question | Student-1 | Student-2 | Student-3 |
|---|---|---|---|
| Q1 | 4 | 3 | 3 |
| Q2 | 2 | 3 | 1 |
| Q3 | 3 | 4 | 4 |
| Survey-Question | Student | Response |
|---|---|---|
| Q1 | 1 | 4 |
| Q2 | 1 | 2 |
| Q3 | 1 | 3 |
| … . | .. . | .. |
| Q2 | 3 | 1 |
| Q3 | 3 | 4 |
library(reshape2)
mydata <- data.frame(Survey.Question=factor(c("Q1","Q2","Q3")),
Student.1=c(4,2,3),
Student.2=c(3,3,4),
Student.3=c(3,1,4))
mydata
## Survey.Question Student.1 Student.2 Student.3 ## 1 Q1 4 3 3 ## 2 Q2 2 3 1 ## 3 Q3 3 4 4
melt(mydata)
## Using Survey.Question as id variables
## Survey.Question variable value ## 1 Q1 Student.1 4 ## 2 Q2 Student.1 2 ## 3 Q3 Student.1 3 ## 4 Q1 Student.2 3 ## 5 Q2 Student.2 3 ## 6 Q3 Student.2 4 ## 7 Q1 Student.3 3 ## 8 Q2 Student.3 1 ## 9 Q3 Student.3 4
melt(mydata, id=1:2)
## Survey.Question Student.1 variable value ## 1 Q1 4 Student.2 3 ## 2 Q2 2 Student.2 3 ## 3 Q3 3 Student.2 4 ## 4 Q1 4 Student.3 3 ## 5 Q2 2 Student.3 1 ## 6 Q3 3 Student.3 4
melt(mydata, value.name="Question.Score", variable.name=c("Student"))
## Using Survey.Question as id variables
## Survey.Question Student Question.Score ## 1 Q1 Student.1 4 ## 2 Q2 Student.1 2 ## 3 Q3 Student.1 3 ## 4 Q1 Student.2 3 ## 5 Q2 Student.2 3 ## 6 Q3 Student.2 4 ## 7 Q1 Student.3 3 ## 8 Q2 Student.3 1 ## 9 Q3 Student.3 4
mydata <- ChickWeight mydata$NewColumn <- mydata$weight^2 head(mydata, n=5)
## weight Time Chick Diet NewColumn ## 1 42 0 1 1 1764 ## 2 51 2 1 1 2601 ## 3 59 4 1 1 3481 ## 4 64 6 1 1 4096 ## 5 76 8 1 1 5776
mydata <- ChickWeight
mutate(mydata,
NewColumn = weight^2,
SecNewCol = NewColumn / 2)
## weight Time Chick Diet NewColumn SecNewCol ## 1 42 0 1 1 1764 882.0 ## 2 51 2 1 1 2601 1300.5 ## 3 59 4 1 1 3481 1740.5 ## 4 64 6 1 1 4096 2048.0 ## 5 76 8 1 1 5776 2888.0 ## 6 93 10 1 1 8649 4324.5 ## 7 106 12 1 1 11236 5618.0 ## 8 125 14 1 1 15625 7812.5 ## 9 149 16 1 1 22201 11100.5 ## 10 171 18 1 1 29241 14620.5 ## 11 199 20 1 1 39601 19800.5 ## 12 205 21 1 1 42025 21012.5 ## 13 40 0 2 1 1600 800.0 ## 14 49 2 2 1 2401 1200.5 ## 15 58 4 2 1 3364 1682.0 ## 16 72 6 2 1 5184 2592.0 ## 17 84 8 2 1 7056 3528.0 ## 18 103 10 2 1 10609 5304.5 ## 19 122 12 2 1 14884 7442.0 ## 20 138 14 2 1 19044 9522.0 ## 21 162 16 2 1 26244 13122.0 ## 22 187 18 2 1 34969 17484.5 ## 23 209 20 2 1 43681 21840.5 ## 24 215 21 2 1 46225 23112.5 ## 25 43 0 3 1 1849 924.5 ## 26 39 2 3 1 1521 760.5 ## 27 55 4 3 1 3025 1512.5 ## 28 67 6 3 1 4489 2244.5 ## 29 84 8 3 1 7056 3528.0 ## 30 99 10 3 1 9801 4900.5 ## 31 115 12 3 1 13225 6612.5 ## 32 138 14 3 1 19044 9522.0 ## 33 163 16 3 1 26569 13284.5 ## 34 187 18 3 1 34969 17484.5 ## 35 198 20 3 1 39204 19602.0 ## 36 202 21 3 1 40804 20402.0 ## 37 42 0 4 1 1764 882.0 ## 38 49 2 4 1 2401 1200.5 ## 39 56 4 4 1 3136 1568.0 ## 40 67 6 4 1 4489 2244.5 ## 41 74 8 4 1 5476 2738.0 ## 42 87 10 4 1 7569 3784.5 ## 43 102 12 4 1 10404 5202.0 ## 44 108 14 4 1 11664 5832.0 ## 45 136 16 4 1 18496 9248.0 ## 46 154 18 4 1 23716 11858.0 ## 47 160 20 4 1 25600 12800.0 ## 48 157 21 4 1 24649 12324.5 ## 49 41 0 5 1 1681 840.5 ## 50 42 2 5 1 1764 882.0 ## 51 48 4 5 1 2304 1152.0 ## 52 60 6 5 1 3600 1800.0 ## 53 79 8 5 1 6241 3120.5 ## 54 106 10 5 1 11236 5618.0 ## 55 141 12 5 1 19881 9940.5 ## 56 164 14 5 1 26896 13448.0 ## 57 197 16 5 1 38809 19404.5 ## 58 199 18 5 1 39601 19800.5 ## 59 220 20 5 1 48400 24200.0 ## 60 223 21 5 1 49729 24864.5 ## 61 41 0 6 1 1681 840.5 ## 62 49 2 6 1 2401 1200.5 ## 63 59 4 6 1 3481 1740.5 ## 64 74 6 6 1 5476 2738.0 ## 65 97 8 6 1 9409 4704.5 ## 66 124 10 6 1 15376 7688.0 ## 67 141 12 6 1 19881 9940.5 ## 68 148 14 6 1 21904 10952.0 ## 69 155 16 6 1 24025 12012.5 ## 70 160 18 6 1 25600 12800.0 ## 71 160 20 6 1 25600 12800.0 ## 72 157 21 6 1 24649 12324.5 ## 73 41 0 7 1 1681 840.5 ## 74 49 2 7 1 2401 1200.5 ## 75 57 4 7 1 3249 1624.5 ## 76 71 6 7 1 5041 2520.5 ## 77 89 8 7 1 7921 3960.5 ## 78 112 10 7 1 12544 6272.0 ## 79 146 12 7 1 21316 10658.0 ## 80 174 14 7 1 30276 15138.0 ## 81 218 16 7 1 47524 23762.0 ## 82 250 18 7 1 62500 31250.0 ## 83 288 20 7 1 82944 41472.0 ## 84 305 21 7 1 93025 46512.5 ## 85 42 0 8 1 1764 882.0 ## 86 50 2 8 1 2500 1250.0 ## 87 61 4 8 1 3721 1860.5 ## 88 71 6 8 1 5041 2520.5 ## 89 84 8 8 1 7056 3528.0 ## 90 93 10 8 1 8649 4324.5 ## 91 110 12 8 1 12100 6050.0 ## 92 116 14 8 1 13456 6728.0 ## 93 126 16 8 1 15876 7938.0 ## 94 134 18 8 1 17956 8978.0 ## 95 125 20 8 1 15625 7812.5 ## 96 42 0 9 1 1764 882.0 ## 97 51 2 9 1 2601 1300.5 ## 98 59 4 9 1 3481 1740.5 ## 99 68 6 9 1 4624 2312.0 ## 100 85 8 9 1 7225 3612.5 ## 101 96 10 9 1 9216 4608.0 ## 102 90 12 9 1 8100 4050.0 ## 103 92 14 9 1 8464 4232.0 ## 104 93 16 9 1 8649 4324.5 ## 105 100 18 9 1 10000 5000.0 ## 106 100 20 9 1 10000 5000.0 ## 107 98 21 9 1 9604 4802.0 ## 108 41 0 10 1 1681 840.5 ## 109 44 2 10 1 1936 968.0 ## 110 52 4 10 1 2704 1352.0 ## 111 63 6 10 1 3969 1984.5 ## 112 74 8 10 1 5476 2738.0 ## 113 81 10 10 1 6561 3280.5 ## 114 89 12 10 1 7921 3960.5 ## 115 96 14 10 1 9216 4608.0 ## 116 101 16 10 1 10201 5100.5 ## 117 112 18 10 1 12544 6272.0 ## 118 120 20 10 1 14400 7200.0 ## 119 124 21 10 1 15376 7688.0 ## 120 43 0 11 1 1849 924.5 ## 121 51 2 11 1 2601 1300.5 ## 122 63 4 11 1 3969 1984.5 ## 123 84 6 11 1 7056 3528.0 ## 124 112 8 11 1 12544 6272.0 ## 125 139 10 11 1 19321 9660.5 ## 126 168 12 11 1 28224 14112.0 ## 127 177 14 11 1 31329 15664.5 ## 128 182 16 11 1 33124 16562.0 ## 129 184 18 11 1 33856 16928.0 ## 130 181 20 11 1 32761 16380.5 ## 131 175 21 11 1 30625 15312.5 ## 132 41 0 12 1 1681 840.5 ## 133 49 2 12 1 2401 1200.5 ## 134 56 4 12 1 3136 1568.0 ## 135 62 6 12 1 3844 1922.0 ## 136 72 8 12 1 5184 2592.0 ## 137 88 10 12 1 7744 3872.0 ## 138 119 12 12 1 14161 7080.5 ## 139 135 14 12 1 18225 9112.5 ## 140 162 16 12 1 26244 13122.0 ## 141 185 18 12 1 34225 17112.5 ## 142 195 20 12 1 38025 19012.5 ## 143 205 21 12 1 42025 21012.5 ## 144 41 0 13 1 1681 840.5 ## 145 48 2 13 1 2304 1152.0 ## 146 53 4 13 1 2809 1404.5 ## 147 60 6 13 1 3600 1800.0 ## 148 65 8 13 1 4225 2112.5 ## 149 67 10 13 1 4489 2244.5 ## 150 71 12 13 1 5041 2520.5 ## 151 70 14 13 1 4900 2450.0 ## 152 71 16 13 1 5041 2520.5 ## 153 81 18 13 1 6561 3280.5 ## 154 91 20 13 1 8281 4140.5 ## 155 96 21 13 1 9216 4608.0 ## 156 41 0 14 1 1681 840.5 ## 157 49 2 14 1 2401 1200.5 ## 158 62 4 14 1 3844 1922.0 ## 159 79 6 14 1 6241 3120.5 ## 160 101 8 14 1 10201 5100.5 ## 161 128 10 14 1 16384 8192.0 ## 162 164 12 14 1 26896 13448.0 ## 163 192 14 14 1 36864 18432.0 ## 164 227 16 14 1 51529 25764.5 ## 165 248 18 14 1 61504 30752.0 ## 166 259 20 14 1 67081 33540.5 ## 167 266 21 14 1 70756 35378.0 ## 168 41 0 15 1 1681 840.5 ## 169 49 2 15 1 2401 1200.5 ## 170 56 4 15 1 3136 1568.0 ## 171 64 6 15 1 4096 2048.0 ## 172 68 8 15 1 4624 2312.0 ## 173 68 10 15 1 4624 2312.0 ## 174 67 12 15 1 4489 2244.5 ## 175 68 14 15 1 4624 2312.0 ## 176 41 0 16 1 1681 840.5 ## 177 45 2 16 1 2025 1012.5 ## 178 49 4 16 1 2401 1200.5 ## 179 51 6 16 1 2601 1300.5 ## 180 57 8 16 1 3249 1624.5 ## 181 51 10 16 1 2601 1300.5 ## 182 54 12 16 1 2916 1458.0 ## 183 42 0 17 1 1764 882.0 ## 184 51 2 17 1 2601 1300.5 ## 185 61 4 17 1 3721 1860.5 ## 186 72 6 17 1 5184 2592.0 ## 187 83 8 17 1 6889 3444.5 ## 188 89 10 17 1 7921 3960.5 ## 189 98 12 17 1 9604 4802.0 ## 190 103 14 17 1 10609 5304.5 ## 191 113 16 17 1 12769 6384.5 ## 192 123 18 17 1 15129 7564.5 ## 193 133 20 17 1 17689 8844.5 ## 194 142 21 17 1 20164 10082.0 ## 195 39 0 18 1 1521 760.5 ## 196 35 2 18 1 1225 612.5 ## 197 43 0 19 1 1849 924.5 ## 198 48 2 19 1 2304 1152.0 ## 199 55 4 19 1 3025 1512.5 ## 200 62 6 19 1 3844 1922.0 ## 201 65 8 19 1 4225 2112.5 ## 202 71 10 19 1 5041 2520.5 ## 203 82 12 19 1 6724 3362.0 ## 204 88 14 19 1 7744 3872.0 ## 205 106 16 19 1 11236 5618.0 ## 206 120 18 19 1 14400 7200.0 ## 207 144 20 19 1 20736 10368.0 ## 208 157 21 19 1 24649 12324.5 ## 209 41 0 20 1 1681 840.5 ## 210 47 2 20 1 2209 1104.5 ## 211 54 4 20 1 2916 1458.0 ## 212 58 6 20 1 3364 1682.0 ## 213 65 8 20 1 4225 2112.5 ## 214 73 10 20 1 5329 2664.5 ## 215 77 12 20 1 5929 2964.5 ## 216 89 14 20 1 7921 3960.5 ## 217 98 16 20 1 9604 4802.0 ## 218 107 18 20 1 11449 5724.5 ## 219 115 20 20 1 13225 6612.5 ## 220 117 21 20 1 13689 6844.5 ## 221 40 0 21 2 1600 800.0 ## 222 50 2 21 2 2500 1250.0 ## 223 62 4 21 2 3844 1922.0 ## 224 86 6 21 2 7396 3698.0 ## 225 125 8 21 2 15625 7812.5 ## 226 163 10 21 2 26569 13284.5 ## 227 217 12 21 2 47089 23544.5 ## 228 240 14 21 2 57600 28800.0 ## 229 275 16 21 2 75625 37812.5 ## 230 307 18 21 2 94249 47124.5 ## 231 318 20 21 2 101124 50562.0 ## 232 331 21 21 2 109561 54780.5 ## 233 41 0 22 2 1681 840.5 ## 234 55 2 22 2 3025 1512.5 ## 235 64 4 22 2 4096 2048.0 ## 236 77 6 22 2 5929 2964.5 ## 237 90 8 22 2 8100 4050.0 ## 238 95 10 22 2 9025 4512.5 ## 239 108 12 22 2 11664 5832.0 ## 240 111 14 22 2 12321 6160.5 ## 241 131 16 22 2 17161 8580.5 ## 242 148 18 22 2 21904 10952.0 ## 243 164 20 22 2 26896 13448.0 ## 244 167 21 22 2 27889 13944.5 ## 245 43 0 23 2 1849 924.5 ## 246 52 2 23 2 2704 1352.0 ## 247 61 4 23 2 3721 1860.5 ## 248 73 6 23 2 5329 2664.5 ## 249 90 8 23 2 8100 4050.0 ## 250 103 10 23 2 10609 5304.5 ## 251 127 12 23 2 16129 8064.5 ## 252 135 14 23 2 18225 9112.5 ## 253 145 16 23 2 21025 10512.5 ## 254 163 18 23 2 26569 13284.5 ## 255 170 20 23 2 28900 14450.0 ## 256 175 21 23 2 30625 15312.5 ## 257 42 0 24 2 1764 882.0 ## 258 52 2 24 2 2704 1352.0 ## 259 58 4 24 2 3364 1682.0 ## 260 74 6 24 2 5476 2738.0 ## 261 66 8 24 2 4356 2178.0 ## 262 68 10 24 2 4624 2312.0 ## 263 70 12 24 2 4900 2450.0 ## 264 71 14 24 2 5041 2520.5 ## 265 72 16 24 2 5184 2592.0 ## 266 72 18 24 2 5184 2592.0 ## 267 76 20 24 2 5776 2888.0 ## 268 74 21 24 2 5476 2738.0 ## 269 40 0 25 2 1600 800.0 ## 270 49 2 25 2 2401 1200.5 ## 271 62 4 25 2 3844 1922.0 ## 272 78 6 25 2 6084 3042.0 ## 273 102 8 25 2 10404 5202.0 ## 274 124 10 25 2 15376 7688.0 ## 275 146 12 25 2 21316 10658.0 ## 276 164 14 25 2 26896 13448.0 ## 277 197 16 25 2 38809 19404.5 ## 278 231 18 25 2 53361 26680.5 ## 279 259 20 25 2 67081 33540.5 ## 280 265 21 25 2 70225 35112.5 ## 281 42 0 26 2 1764 882.0 ## 282 48 2 26 2 2304 1152.0 ## 283 57 4 26 2 3249 1624.5 ## 284 74 6 26 2 5476 2738.0 ## 285 93 8 26 2 8649 4324.5 ## 286 114 10 26 2 12996 6498.0 ## 287 136 12 26 2 18496 9248.0 ## 288 147 14 26 2 21609 10804.5 ## 289 169 16 26 2 28561 14280.5 ## 290 205 18 26 2 42025 21012.5 ## 291 236 20 26 2 55696 27848.0 ## 292 251 21 26 2 63001 31500.5 ## 293 39 0 27 2 1521 760.5 ## 294 46 2 27 2 2116 1058.0 ## 295 58 4 27 2 3364 1682.0 ## 296 73 6 27 2 5329 2664.5 ## 297 87 8 27 2 7569 3784.5 ## 298 100 10 27 2 10000 5000.0 ## 299 115 12 27 2 13225 6612.5 ## 300 123 14 27 2 15129 7564.5 ## 301 144 16 27 2 20736 10368.0 ## 302 163 18 27 2 26569 13284.5 ## 303 185 20 27 2 34225 17112.5 ## 304 192 21 27 2 36864 18432.0 ## 305 39 0 28 2 1521 760.5 ## 306 46 2 28 2 2116 1058.0 ## 307 58 4 28 2 3364 1682.0 ## 308 73 6 28 2 5329 2664.5 ## 309 92 8 28 2 8464 4232.0 ## 310 114 10 28 2 12996 6498.0 ## 311 145 12 28 2 21025 10512.5 ## 312 156 14 28 2 24336 12168.0 ## 313 184 16 28 2 33856 16928.0 ## 314 207 18 28 2 42849 21424.5 ## 315 212 20 28 2 44944 22472.0 ## 316 233 21 28 2 54289 27144.5 ## 317 39 0 29 2 1521 760.5 ## 318 48 2 29 2 2304 1152.0 ## 319 59 4 29 2 3481 1740.5 ## 320 74 6 29 2 5476 2738.0 ## 321 87 8 29 2 7569 3784.5 ## 322 106 10 29 2 11236 5618.0 ## 323 134 12 29 2 17956 8978.0 ## 324 150 14 29 2 22500 11250.0 ## 325 187 16 29 2 34969 17484.5 ## 326 230 18 29 2 52900 26450.0 ## 327 279 20 29 2 77841 38920.5 ## 328 309 21 29 2 95481 47740.5 ## 329 42 0 30 2 1764 882.0 ## 330 48 2 30 2 2304 1152.0 ## 331 59 4 30 2 3481 1740.5 ## 332 72 6 30 2 5184 2592.0 ## 333 85 8 30 2 7225 3612.5 ## 334 98 10 30 2 9604 4802.0 ## 335 115 12 30 2 13225 6612.5 ## 336 122 14 30 2 14884 7442.0 ## 337 143 16 30 2 20449 10224.5 ## 338 151 18 30 2 22801 11400.5 ## 339 157 20 30 2 24649 12324.5 ## 340 150 21 30 2 22500 11250.0 ## 341 42 0 31 3 1764 882.0 ## 342 53 2 31 3 2809 1404.5 ## 343 62 4 31 3 3844 1922.0 ## 344 73 6 31 3 5329 2664.5 ## 345 85 8 31 3 7225 3612.5 ## 346 102 10 31 3 10404 5202.0 ## 347 123 12 31 3 15129 7564.5 ## 348 138 14 31 3 19044 9522.0 ## 349 170 16 31 3 28900 14450.0 ## 350 204 18 31 3 41616 20808.0 ## 351 235 20 31 3 55225 27612.5 ## 352 256 21 31 3 65536 32768.0 ## 353 41 0 32 3 1681 840.5 ## 354 49 2 32 3 2401 1200.5 ## 355 65 4 32 3 4225 2112.5 ## 356 82 6 32 3 6724 3362.0 ## 357 107 8 32 3 11449 5724.5 ## 358 129 10 32 3 16641 8320.5 ## 359 159 12 32 3 25281 12640.5 ## 360 179 14 32 3 32041 16020.5 ## 361 221 16 32 3 48841 24420.5 ## 362 263 18 32 3 69169 34584.5 ## 363 291 20 32 3 84681 42340.5 ## 364 305 21 32 3 93025 46512.5 ## 365 39 0 33 3 1521 760.5 ## 366 50 2 33 3 2500 1250.0 ## 367 63 4 33 3 3969 1984.5 ## 368 77 6 33 3 5929 2964.5 ## 369 96 8 33 3 9216 4608.0 ## 370 111 10 33 3 12321 6160.5 ## 371 137 12 33 3 18769 9384.5 ## 372 144 14 33 3 20736 10368.0 ## 373 151 16 33 3 22801 11400.5 ## 374 146 18 33 3 21316 10658.0 ## 375 156 20 33 3 24336 12168.0 ## 376 147 21 33 3 21609 10804.5 ## 377 41 0 34 3 1681 840.5 ## 378 49 2 34 3 2401 1200.5 ## 379 63 4 34 3 3969 1984.5 ## 380 85 6 34 3 7225 3612.5 ## 381 107 8 34 3 11449 5724.5 ## 382 134 10 34 3 17956 8978.0 ## 383 164 12 34 3 26896 13448.0 ## 384 186 14 34 3 34596 17298.0 ## 385 235 16 34 3 55225 27612.5 ## 386 294 18 34 3 86436 43218.0 ## 387 327 20 34 3 106929 53464.5 ## 388 341 21 34 3 116281 58140.5 ## 389 41 0 35 3 1681 840.5 ## 390 53 2 35 3 2809 1404.5 ## 391 64 4 35 3 4096 2048.0 ## 392 87 6 35 3 7569 3784.5 ## 393 123 8 35 3 15129 7564.5 ## 394 158 10 35 3 24964 12482.0 ## 395 201 12 35 3 40401 20200.5 ## 396 238 14 35 3 56644 28322.0 ## 397 287 16 35 3 82369 41184.5 ## 398 332 18 35 3 110224 55112.0 ## 399 361 20 35 3 130321 65160.5 ## 400 373 21 35 3 139129 69564.5 ## 401 39 0 36 3 1521 760.5 ## 402 48 2 36 3 2304 1152.0 ## 403 61 4 36 3 3721 1860.5 ## 404 76 6 36 3 5776 2888.0 ## 405 98 8 36 3 9604 4802.0 ## 406 116 10 36 3 13456 6728.0 ## 407 145 12 36 3 21025 10512.5 ## 408 166 14 36 3 27556 13778.0 ## 409 198 16 36 3 39204 19602.0 ## 410 227 18 36 3 51529 25764.5 ## 411 225 20 36 3 50625 25312.5 ## 412 220 21 36 3 48400 24200.0 ## 413 41 0 37 3 1681 840.5 ## 414 48 2 37 3 2304 1152.0 ## 415 56 4 37 3 3136 1568.0 ## 416 68 6 37 3 4624 2312.0 ## 417 80 8 37 3 6400 3200.0 ## 418 83 10 37 3 6889 3444.5 ## 419 103 12 37 3 10609 5304.5 ## 420 112 14 37 3 12544 6272.0 ## 421 135 16 37 3 18225 9112.5 ## 422 157 18 37 3 24649 12324.5 ## 423 169 20 37 3 28561 14280.5 ## 424 178 21 37 3 31684 15842.0 ## 425 41 0 38 3 1681 840.5 ## 426 49 2 38 3 2401 1200.5 ## 427 61 4 38 3 3721 1860.5 ## 428 74 6 38 3 5476 2738.0 ## 429 98 8 38 3 9604 4802.0 ## 430 109 10 38 3 11881 5940.5 ## 431 128 12 38 3 16384 8192.0 ## 432 154 14 38 3 23716 11858.0 ## 433 192 16 38 3 36864 18432.0 ## 434 232 18 38 3 53824 26912.0 ## 435 280 20 38 3 78400 39200.0 ## 436 290 21 38 3 84100 42050.0 ## 437 42 0 39 3 1764 882.0 ## 438 50 2 39 3 2500 1250.0 ## 439 61 4 39 3 3721 1860.5 ## 440 78 6 39 3 6084 3042.0 ## 441 89 8 39 3 7921 3960.5 ## 442 109 10 39 3 11881 5940.5 ## 443 130 12 39 3 16900 8450.0 ## 444 146 14 39 3 21316 10658.0 ## 445 170 16 39 3 28900 14450.0 ## 446 214 18 39 3 45796 22898.0 ## 447 250 20 39 3 62500 31250.0 ## 448 272 21 39 3 73984 36992.0 ## 449 41 0 40 3 1681 840.5 ## 450 55 2 40 3 3025 1512.5 ## 451 66 4 40 3 4356 2178.0 ## 452 79 6 40 3 6241 3120.5 ## 453 101 8 40 3 10201 5100.5 ## 454 120 10 40 3 14400 7200.0 ## 455 154 12 40 3 23716 11858.0 ## 456 182 14 40 3 33124 16562.0 ## 457 215 16 40 3 46225 23112.5 ## 458 262 18 40 3 68644 34322.0 ## 459 295 20 40 3 87025 43512.5 ## 460 321 21 40 3 103041 51520.5 ## 461 42 0 41 4 1764 882.0 ## 462 51 2 41 4 2601 1300.5 ## 463 66 4 41 4 4356 2178.0 ## 464 85 6 41 4 7225 3612.5 ## 465 103 8 41 4 10609 5304.5 ## 466 124 10 41 4 15376 7688.0 ## 467 155 12 41 4 24025 12012.5 ## 468 153 14 41 4 23409 11704.5 ## 469 175 16 41 4 30625 15312.5 ## 470 184 18 41 4 33856 16928.0 ## 471 199 20 41 4 39601 19800.5 ## 472 204 21 41 4 41616 20808.0 ## 473 42 0 42 4 1764 882.0 ## 474 49 2 42 4 2401 1200.5 ## 475 63 4 42 4 3969 1984.5 ## 476 84 6 42 4 7056 3528.0 ## 477 103 8 42 4 10609 5304.5 ## 478 126 10 42 4 15876 7938.0 ## 479 160 12 42 4 25600 12800.0 ## 480 174 14 42 4 30276 15138.0 ## 481 204 16 42 4 41616 20808.0 ## 482 234 18 42 4 54756 27378.0 ## 483 269 20 42 4 72361 36180.5 ## 484 281 21 42 4 78961 39480.5 ## 485 42 0 43 4 1764 882.0 ## 486 55 2 43 4 3025 1512.5 ## 487 69 4 43 4 4761 2380.5 ## 488 96 6 43 4 9216 4608.0 ## 489 131 8 43 4 17161 8580.5 ## 490 157 10 43 4 24649 12324.5 ## 491 184 12 43 4 33856 16928.0 ## 492 188 14 43 4 35344 17672.0 ## 493 197 16 43 4 38809 19404.5 ## 494 198 18 43 4 39204 19602.0 ## 495 199 20 43 4 39601 19800.5 ## 496 200 21 43 4 40000 20000.0 ## 497 42 0 44 4 1764 882.0 ## 498 51 2 44 4 2601 1300.5 ## 499 65 4 44 4 4225 2112.5 ## 500 86 6 44 4 7396 3698.0 ## 501 103 8 44 4 10609 5304.5 ## 502 118 10 44 4 13924 6962.0 ## 503 127 12 44 4 16129 8064.5 ## 504 138 14 44 4 19044 9522.0 ## 505 145 16 44 4 21025 10512.5 ## 506 146 18 44 4 21316 10658.0 ## 507 41 0 45 4 1681 840.5 ## 508 50 2 45 4 2500 1250.0 ## 509 61 4 45 4 3721 1860.5 ## 510 78 6 45 4 6084 3042.0 ## 511 98 8 45 4 9604 4802.0 ## 512 117 10 45 4 13689 6844.5 ## 513 135 12 45 4 18225 9112.5 ## 514 141 14 45 4 19881 9940.5 ## 515 147 16 45 4 21609 10804.5 ## 516 174 18 45 4 30276 15138.0 ## 517 197 20 45 4 38809 19404.5 ## 518 196 21 45 4 38416 19208.0 ## 519 40 0 46 4 1600 800.0 ## 520 52 2 46 4 2704 1352.0 ## 521 62 4 46 4 3844 1922.0 ## 522 82 6 46 4 6724 3362.0 ## 523 101 8 46 4 10201 5100.5 ## 524 120 10 46 4 14400 7200.0 ## 525 144 12 46 4 20736 10368.0 ## 526 156 14 46 4 24336 12168.0 ## 527 173 16 46 4 29929 14964.5 ## 528 210 18 46 4 44100 22050.0 ## 529 231 20 46 4 53361 26680.5 ## 530 238 21 46 4 56644 28322.0 ## 531 41 0 47 4 1681 840.5 ## 532 53 2 47 4 2809 1404.5 ## 533 66 4 47 4 4356 2178.0 ## 534 79 6 47 4 6241 3120.5 ## 535 100 8 47 4 10000 5000.0 ## 536 123 10 47 4 15129 7564.5 ## 537 148 12 47 4 21904 10952.0 ## 538 157 14 47 4 24649 12324.5 ## 539 168 16 47 4 28224 14112.0 ## 540 185 18 47 4 34225 17112.5 ## 541 210 20 47 4 44100 22050.0 ## 542 205 21 47 4 42025 21012.5 ## 543 39 0 48 4 1521 760.5 ## 544 50 2 48 4 2500 1250.0 ## 545 62 4 48 4 3844 1922.0 ## 546 80 6 48 4 6400 3200.0 ## 547 104 8 48 4 10816 5408.0 ## 548 125 10 48 4 15625 7812.5 ## 549 154 12 48 4 23716 11858.0 ## 550 170 14 48 4 28900 14450.0 ## 551 222 16 48 4 49284 24642.0 ## 552 261 18 48 4 68121 34060.5 ## 553 303 20 48 4 91809 45904.5 ## 554 322 21 48 4 103684 51842.0 ## 555 40 0 49 4 1600 800.0 ## 556 53 2 49 4 2809 1404.5 ## 557 64 4 49 4 4096 2048.0 ## 558 85 6 49 4 7225 3612.5 ## 559 108 8 49 4 11664 5832.0 ## 560 128 10 49 4 16384 8192.0 ## 561 152 12 49 4 23104 11552.0 ## 562 166 14 49 4 27556 13778.0 ## 563 184 16 49 4 33856 16928.0 ## 564 203 18 49 4 41209 20604.5 ## 565 233 20 49 4 54289 27144.5 ## 566 237 21 49 4 56169 28084.5 ## 567 41 0 50 4 1681 840.5 ## 568 54 2 50 4 2916 1458.0 ## 569 67 4 50 4 4489 2244.5 ## 570 84 6 50 4 7056 3528.0 ## 571 105 8 50 4 11025 5512.5 ## 572 122 10 50 4 14884 7442.0 ## 573 155 12 50 4 24025 12012.5 ## 574 175 14 50 4 30625 15312.5 ## 575 205 16 50 4 42025 21012.5 ## 576 234 18 50 4 54756 27378.0 ## 577 264 20 50 4 69696 34848.0 ## 578 264 21 50 4 69696 34848.0
head(mydata)
## weight Time Chick Diet ## 1 42 0 1 1 ## 2 51 2 1 1 ## 3 59 4 1 1 ## 4 64 6 1 1 ## 5 76 8 1 1 ## 6 93 10 1 1
myStates = mutate(as.data.frame(state.x77), Region=state.region) group_by(myStates,Region)
## # A tibble: 50 x 9 ## # Groups: Region [4] ## Population Income Illiteracy `Life Exp` Murder `HS Grad` Frost Area ## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> ## 1 3615 3624 2.1 69.0 15.1 41.3 20 50708 ## 2 365 6315 1.5 69.3 11.3 66.7 152 566432 ## 3 2212 4530 1.8 70.6 7.8 58.1 15 113417 ## 4 2110 3378 1.9 70.7 10.1 39.9 65 51945 ## 5 21198 5114 1.1 71.7 10.3 62.6 20 156361 ## 6 2541 4884 0.7 72.1 6.8 63.9 166 103766 ## 7 3100 5348 1.1 72.5 3.1 56 139 4862 ## 8 579 4809 0.9 70.1 6.2 54.6 103 1982 ## 9 8277 4815 1.3 70.7 10.7 52.6 11 54090 ## 10 4931 4091 2 68.5 13.9 40.6 60 58073 ## # … with 40 more rows, and 1 more variable: Region <fct>
myStates = mutate(as.data.frame(state.x77), Region=state.region)
summarise(group_by(myStates,Region),
AvgPopulationSize=mean(Population),
SDPopulationSize=sqrt(var(Population)))
## # A tibble: 4 x 3 ## Region AvgPopulationSize SDPopulationSize ## <fct> <dbl> <dbl> ## 1 Northeast 5495. 6080. ## 2 South 4208. 2780. ## 3 North Central 4803 3703. ## 4 West 2915. 5579.
library(ggplot2)
myStates = mutate(as.data.frame(state.x77), Region=state.region)
RegionSummary = summarise(group_by(myStates,Region),
AvgPopulationSize=mean(Population),
SDPopulationSize=sqrt(var(Population)))
ggplot(RegionSummary, aes(x=Region, y=AvgPopulationSize)) +
geom_errorbar(aes(ymin=AvgPopulationSize-SDPopulationSize/2,
ymax=AvgPopulationSize+SDPopulationSize/2),
width=0.4, size=0.75) +
geom_point(shape=21, size=10, fill="white") +
xlab("US Region") + ylab("Average State Population Size in Region") +
theme(text=element_text(size=18))
We can use R to construct statistical models
Typically, we have to tell R what the response and explanatory variables of the model are using something called a formula
You’ll see the R notation for this again when we study facets
response variable ~ explanatory variables
\[ r = \frac{n\left(\sum_{i=1}^{n} \sum_{j=1}^{n}x_iy_j\right) - \left(\sum_{i=1}^{n}x_i\right)\left(\sum_{j=1}^{n}y_j\right)} {\sqrt{\left(n\sum_{i=1}^{n}x_i^2 -(\sum_{i=1}^{n}x_i)^2\right) \left(n\sum_{j=1}^{n}y_j^2 -(\sum_{j=1}^{n}y_j)^2\right)}}\]
For linear models, we use the function lm()
lm() takes the data set and the formula, then returns a model
fit = lm(data=Orange,formula= age ~ circumference) summary(fit)
## ## Call: ## lm(formula = age ~ circumference, data = Orange) ## ## Residuals: ## Min 1Q Median 3Q Max ## -317.88 -140.90 -17.20 96.54 471.16 ## ## Coefficients: ## Estimate Std. Error t value Pr(>|t|) ## (Intercept) 16.6036 78.1406 0.212 0.833 ## circumference 7.8160 0.6059 12.900 1.93e-14 *** ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## ## Residual standard error: 203.1 on 33 degrees of freedom ## Multiple R-squared: 0.8345, Adjusted R-squared: 0.8295 ## F-statistic: 166.4 on 1 and 33 DF, p-value: 1.931e-14
\(r^2\) is an ad-hoc measure of variance from beteween actual and predicted values from the model
ds <- mtcars ds$arbitrary = runif(length(mtcars$mpg)) # Add an obviously uncorrelated variable summary(lm(mpg ~ hp, data=ds))$r.squared
## [1] 0.6024373
summary(lm(mpg ~ hp + wt, data=ds))$r.squared
## [1] 0.8267855
summary(lm(mpg ~ hp + wt + arbitrary, data=ds))$r.squared
## [1] 0.8424729
# Somehow including the arbitrary variable improved the fit?
# using ds from last slide summary(lm(mpg ~ hp, data=ds))$adj.r.squared
## [1] 0.5891853
summary(lm(mpg ~ hp + wt, data=ds))$adj.r.squared
## [1] 0.8148396
summary(lm(mpg ~ hp + wt + arbitrary, data=ds))$adj.r.squared
## [1] 0.825595
# No, the fit is worse once we adjust for the variable's contributions ...
Suppose I have explanatory variable \(x\), as well as a variable \(y\) that depends on \(x\) by a quadratic function
This means that there’s some function: \[y = \alpha_0 + \alpha_1 x^2 + \alpha_2 x\]
But this is just like a linear function, if \(x^2\) were it’s own variable! \[y = \alpha_0 + \alpha_1 z + \alpha_2 x\]
fit = lm(data=hatcolor,formula= y ~ x^2 + x)
fit = lm(data=hatcolor,formula= y ~ sin(x) + x + x^3)
fit = lm(data=hatcolor,formula= y ~ poly(x,2)); summary(fit)
hatcolorURL = "http://cs.ucf.edu/~wiegand/ids6938/datasets/hatcolor.csv" hatcolor = read.table(hatcolorURL,header=TRUE) summary(lm(data=hatcolor,formula=Coolitude ~ HatLightness))$r.squared
## [1] 0.005302867
summary(lm(data=hatcolor,formula=Coolitude ~ HatLightness))$adj.r.squared
## [1] -0.01541999
summary(lm(data=hatcolor,formula=Coolitude ~ poly(HatLightness,2)))$r.squared
## [1] 0.8888319
summary(lm(data=hatcolor,formula=Coolitude ~ poly(HatLightness,2)))$adj.r.squared
## [1] 0.8841013
# Build the Model
carModel = lm(data=mtcars, formula=mpg ~ wt + factor(cyl))
# Create the dataset over which to make predictions
newCarData = data.frame(wt=c(2.5,1.9),
cyl=factor(c(4,6), levels= levels(factor(mtcars$cyl))))
# Make predictions
newCarData$mpg = predict(carModel, newdata=newCarData)
print(newCarData)
## wt cyl mpg ## 1 2.5 4 25.97676 ## 2 1.9 6 23.64455
The lm() function assumes the mean value for response variable modeled by the linear function over the explanatory variables deviates by a normal distribution
I.e.,: \(y = \alpha_0 + \left(\sum_{i=1}^n \alpha_i x_i\right) + {\cal N}(\mu,\sigma)\)
That is, that points more or less follow the model except for an additive error that is both Normal and i.i.d.
But what if the distribution is different?
Generalized linear models extend traditional methods to allow the mean to depend on the explanatory variable through a link function, and use different kinds of distributions
This is particularly significant with the response variable is categorical, rather than numeric
In R, you can use glm() to perform such modeling
Suppose you have a dataset with different student applicants to a graduate program, each with verbal, quantitative, and analytical scores on the GRE, as well as a GPA value and several other numeric measures
You want a predictive model as to whether a given applicant will be accepted or not
The response variable is categorical (1 or 0, accepted or not) and the explanatory variables are numeric
You can’t just use regular linear regression, you must use logistic regression (“logit”), which models the response variable using a binomial distribution
acceptModel = glm(data=somedata, formula = accept ~ ., family=binomial)
We can do a number of things with such a model:
Get the coefficients for the model
Get confidence intervals for the different explanatory variables for acceptance
Predict the probability whether a new point(s) would be accepted or not
acceptModel = glm(data=somedata, formula = accept ~ ., family=binomial)
coef(acceptModel)
exp(confint.default(acceptModel))
predict.glm(acceptModel,
data.frame(verbal=155,quant=160,analytic=4,gpa=3.6),
type="response",
se.fit=TRUE)
You can weight different points in the data set differently (i.e., weighted linear regression) by giving lm() or glm() a weights vector
You can use other non-linear models (e.g., an exponential function of the explanatory variables)
You can use local regression, loess(), which uses a \(k-\)nearest neighbor type approach to produce a piece-wise continuous curve that fits points as well as possible in local regions of the space
Soon, we’ll use the R notation of formulas when using facets to build trellis displays