From the Ecdat Package, I selected a dataset that contained N=308 observations. The data looks into different factors that affect the pricing of the diamond carats. However, the study selected a pair of categorical independent variables and a continuous variable dependent variable. Also, the data was organized so that one independent variable was blocked. Further analysis were made regarding the study.
The data was copied and pasted into Excel, and then saved as a .csv. Shown below is the loaded data into R.
#Load the data
diamond = read.csv("diamonds.csv", header = TRUE, sep=",")
diamond
## price color certification
## 1 1302 D GIA
## 2 1510 E GIA
## 3 1510 G GIA
## 4 1260 G GIA
## 5 1641 D GIA
## 6 1555 E GIA
## 7 1427 F GIA
## 8 1427 G GIA
## 9 1126 H GIA
## 10 1126 I GIA
## 11 1468 F GIA
## 12 1202 G GIA
## 13 1327 E GIA
## 14 1098 I GIA
## 15 1693 E GIA
## 16 1551 F GIA
## 17 1410 G GIA
## 18 1269 G GIA
## 19 1316 H GIA
## 20 1222 H GIA
## 21 1738 E GIA
## 22 1593 F GIA
## 23 1447 G GIA
## 24 1255 H GIA
## 25 1635 F GIA
## 26 1485 H GIA
## 27 1420 F GIA
## 28 1420 H GIA
## 29 1911 F GIA
## 30 1525 H GIA
## 31 1956 F GIA
## 32 1747 H GIA
## 33 1572 I GIA
## 34 2942 E GIA
## 35 2532 G GIA
## 36 3501 E GIA
## 37 3501 E GIA
## 38 3501 F GIA
## 39 3293 F GIA
## 40 3016 G GIA
## 41 3567 F GIA
## 42 3205 G GIA
## 43 3490 D GIA
## 44 3635 E GIA
## 45 3635 F GIA
## 46 3418 F GIA
## 47 3921 D GIA
## 48 3701 F GIA
## 49 3480 F GIA
## 50 3407 G GIA
## 51 3767 E GIA
## 52 4066 F GIA
## 53 4138 E GIA
## 54 3605 F GIA
## 55 3529 G GIA
## 56 3667 F GIA
## 57 2892 I GIA
## 58 3651 G GIA
## 59 3773 G GIA
## 60 4291 F GIA
## 61 5845 E GIA
## 62 4401 G GIA
## 63 4759 G GIA
## 64 4300 H GIA
## 65 5510 F GIA
## 66 5122 G GIA
## 67 5122 H GIA
## 68 3861 I GIA
## 69 5881 F GIA
## 70 5586 F GIA
## 71 5193 F GIA
## 72 5193 H GIA
## 73 5263 F GIA
## 74 5441 I GIA
## 75 4948 I GIA
## 76 5705 H GIA
## 77 6805 F GIA
## 78 6882 H GIA
## 79 6709 H GIA
## 80 6682 I GIA
## 81 3501 E GIA
## 82 3432 G GIA
## 83 3851 F GIA
## 84 3605 H GIA
## 85 3900 E GIA
## 86 3415 H GIA
## 87 4291 H GIA
## 88 6512 E GIA
## 89 5800 E GIA
## 90 6285 F GIA
## 91 5122 F GIA
## 92 5122 F GIA
## 93 5122 G GIA
## 94 5122 H GIA
## 95 6372 D GIA
## 96 5881 E GIA
## 97 5193 H GIA
## 98 5961 E GIA
## 99 5662 H GIA
## 100 5738 E GIA
## 101 5030 H GIA
## 102 5030 H GIA
## 103 4727 I GIA
## 104 4221 I GIA
## 105 5815 G GIA
## 106 4585 H GIA
## 107 7368 D GIA
## 108 4667 I GIA
## 109 4355 I GIA
## 110 9885 D GIA
## 111 6919 F GIA
## 112 5386 H GIA
## 113 4832 I GIA
## 114 7156 E GIA
## 115 7680 F GIA
## 116 15582 D GIA
## 117 11419 D GIA
## 118 10588 E GIA
## 119 9757 E GIA
## 120 13913 F GIA
## 121 10588 F GIA
## 122 10713 F GIA
## 123 9480 F GIA
## 124 9896 G GIA
## 125 9619 G GIA
## 126 9169 G GIA
## 127 9203 G GIA
## 128 8788 H GIA
## 129 8095 I GIA
## 130 7818 I GIA
## 131 16008 D GIA
## 132 10692 E GIA
## 133 9853 E GIA
## 134 10272 F GIA
## 135 9573 F GIA
## 136 9153 H GIA
## 137 8873 H GIA
## 138 8873 I GIA
## 139 8455 I GIA
## 140 7895 I GIA
## 141 10372 F GIA
## 142 9666 F GIA
## 143 10090 G GIA
## 144 10900 E GIA
## 145 10571 F GIA
## 146 9563 I GIA
## 147 8781 I GIA
## 148 9743 G GIA
## 149 9302 H GIA
## 150 8945 I GIA
## 151 9646 H GIA
## 152 823 F IGI
## 153 765 F IGI
## 154 803 G IGI
## 155 803 G IGI
## 156 705 G IGI
## 157 725 H IGI
## 158 967 D IGI
## 159 1050 E IGI
## 160 967 F IGI
## 161 863 F IGI
## 162 800 F IGI
## 163 842 G IGI
## 164 800 G IGI
## 165 758 H IGI
## 166 880 D IGI
## 167 880 G IGI
## 168 705 G IGI
## 169 638 G IGI
## 170 919 D IGI
## 171 1149 E IGI
## 172 1057 F IGI
## 173 919 G IGI
## 174 1198 E IGI
## 175 1248 E IGI
## 176 1147 F IGI
## 177 995 G IGI
## 178 1108 H IGI
## 179 1485 F IGI
## 180 1283 G IGI
## 181 1149 H IGI
## 182 1082 I IGI
## 183 1539 F IGI
## 184 1365 F IGI
## 185 1260 F IGI
## 186 1121 I IGI
## 187 1595 F IGI
## 188 1233 H IGI
## 189 1199 I IGI
## 190 1471 G IGI
## 191 1238 I IGI
## 192 1580 E IGI
## 193 1459 F IGI
## 194 1459 G IGI
## 195 1218 H IGI
## 196 1299 I IGI
## 197 1628 E IGI
## 198 1628 F IGI
## 199 1337 I IGI
## 200 1462 H IGI
## 201 1503 H IGI
## 202 1773 F IGI
## 203 1636 F IGI
## 204 1821 F IGI
## 205 1540 G IGI
## 206 2276 G IGI
## 207 1616 I IGI
## 208 1506 I IGI
## 209 2651 F IGI
## 210 2383 F IGI
## 211 3652 G IGI
## 212 3722 E IGI
## 213 3722 F IGI
## 214 3095 I IGI
## 215 3706 F IGI
## 216 4070 E IGI
## 217 3470 G IGI
## 218 4831 E IGI
## 219 4209 F IGI
## 220 3821 G IGI
## 221 5607 G IGI
## 222 5326 G IGI
## 223 6160 D IGI
## 224 6095 F IGI
## 225 5937 G IGI
## 226 9342 H IGI
## 227 9713 G IGI
## 228 8873 H IGI
## 229 8175 I IGI
## 230 3778 F HRD
## 231 3432 G HRD
## 232 3851 F HRD
## 233 3346 E HRD
## 234 3130 H HRD
## 235 3995 F HRD
## 236 3701 F HRD
## 237 3529 G HRD
## 238 3667 F HRD
## 239 3202 F HRD
## 240 3256 F HRD
## 241 3415 H HRD
## 242 3792 H HRD
## 243 3925 G HRD
## 244 3421 G HRD
## 245 3925 H HRD
## 246 3616 H HRD
## 247 3615 I HRD
## 248 3785 H HRD
## 249 3643 I HRD
## 250 4300 H HRD
## 251 6867 E HRD
## 252 6285 E HRD
## 253 5800 G HRD
## 254 5510 G HRD
## 255 4346 H HRD
## 256 6372 G HRD
## 257 5193 H HRD
## 258 5662 H HRD
## 259 5333 F HRD
## 260 6041 G HRD
## 261 5815 H HRD
## 262 8611 F HRD
## 263 6905 F HRD
## 264 6905 G HRD
## 265 6416 H HRD
## 266 6051 H HRD
## 267 8715 E HRD
## 268 6988 E HRD
## 269 6988 F HRD
## 270 6495 G HRD
## 271 7358 H HRD
## 272 6572 F HRD
## 273 7072 G HRD
## 274 8359 F HRD
## 275 6805 F HRD
## 276 7711 G HRD
## 277 5835 H HRD
## 278 13775 D HRD
## 279 14051 E HRD
## 280 11419 E HRD
## 281 10588 E HRD
## 282 11696 F HRD
## 283 10588 F HRD
## 284 10450 G HRD
## 285 9896 G HRD
## 286 9203 G HRD
## 287 9480 H HRD
## 288 9065 H HRD
## 289 8788 H HRD
## 290 8788 I HRD
## 291 8372 I HRD
## 292 8095 I HRD
## 293 7818 I HRD
## 294 13909 D HRD
## 295 11531 E HRD
## 296 10692 E HRD
## 297 11811 F HRD
## 298 10272 F HRD
## 299 9993 G HRD
## 300 9293 G HRD
## 301 9433 H HRD
## 302 9153 H HRD
## 303 8873 I HRD
## 304 8175 I HRD
## 305 10796 F HRD
## 306 9890 H HRD
## 307 8959 H HRD
## 308 9107 I HRD
The dataset helps determine the effects of color and certification to the pricing of the diamond. Initially, the dataset consisted of many factors. However, I chose two independent variables that were of interest based purely on my preference.
I selected the color and certification to examine their effects. In addition, I chose which indepdendent variable to block. I blocked the variable that seemed to have less of an effect on price. Therefore, I decided to block certification because I thought certification had to do more with the institution certifying the diamond and less with the value of the diamond itself, like the color of the diamon. I was interested in studying a Randomized Complete Block Design.
Below are the definitions of the variables:
price: the price of the diamond in Singapore $
color: the color of the diamond with a factor of levels (D,E,F,G,H,I)
certification: the certification body with a factor of levels (GIA,IGI,HRD)
#Display the descriptive statistics
summary(diamond)
## price color certification
## Min. : 638 D:16 GIA:151
## 1st Qu.: 1625 E:44 HRD: 79
## Median : 4215 F:82 IGI: 78
## Mean : 5019 G:65
## 3rd Qu.: 7446 H:61
## Max. :16008 I:40
#System under test
head(diamond)
## price color certification
## 1 1302 D GIA
## 2 1510 E GIA
## 3 1510 G GIA
## 4 1260 G GIA
## 5 1641 D GIA
## 6 1555 E GIA
names(diamond)
## [1] "price" "color" "certification"
The data contains 3 factors and several levels. The as.factors was not necessary because they were categorical.
#Display the structure
str(diamond)
## 'data.frame': 308 obs. of 3 variables:
## $ price : int 1302 1510 1510 1260 1641 1555 1427 1427 1126 1126 ...
## $ color : Factor w/ 6 levels "D","E","F","G",..: 1 2 4 4 1 2 3 4 5 6 ...
## $ certification: Factor w/ 3 levels "GIA","HRD","IGI": 1 1 1 1 1 1 1 1 1 1 ...
#Check the levels
levels(diamond$price)
## NULL
levels(diamond$color)
## [1] "D" "E" "F" "G" "H" "I"
levels(diamond$certification)
## [1] "GIA" "HRD" "IGI"
To summarize, the levels are:
color: D,E,F,G,H,I
certification: GIA,IGI,HRD
The data had one continuous variable that was also the dependent variable. It was the price.
The main objective of the experiment was to determine the effects of pricing on diamond. Therefore, the price variable also represents the response variable as well.
The data consists of N=308 obversations with 3 variables. The data was collected in the year 2000 and the country the data came from was Singapore.
Here are the organizations and views of the data:
summary(diamond)
## price color certification
## Min. : 638 D:16 GIA:151
## 1st Qu.: 1625 E:44 HRD: 79
## Median : 4215 F:82 IGI: 78
## Mean : 5019 G:65
## 3rd Qu.: 7446 H:61
## Max. :16008 I:40
head(diamond)
## price color certification
## 1 1302 D GIA
## 2 1510 E GIA
## 3 1510 G GIA
## 4 1260 G GIA
## 5 1641 D GIA
## 6 1555 E GIA
tail(diamond)
## price color certification
## 303 8873 I HRD
## 304 8175 I HRD
## 305 10796 F HRD
## 306 9890 H HRD
## 307 8959 H HRD
## 308 9107 I HRD
Next, let’s look at the block design.
To consider for blocking, I needed to organize the data so that I could block on one independent variable and test a hypothesis on the second independent variable. Therefore, I chose certification as the variable to block. The color was the treatment. Take a closer look at the re-organized dataset:
#Organize the data to treatments and block
data = read.csv("block.csv", header = TRUE, sep=",")
data
## price treatment block
## 1 1302 D 1
## 2 1510 E 1
## 3 1510 G 1
## 4 1260 G 1
## 5 1641 D 1
## 6 1555 E 1
## 7 1427 F 1
## 8 1427 G 1
## 9 1126 H 1
## 10 1126 I 1
## 11 1468 F 1
## 12 1202 G 1
## 13 1327 E 1
## 14 1098 I 1
## 15 1693 E 1
## 16 1551 F 1
## 17 1410 G 1
## 18 1269 G 1
## 19 1316 H 1
## 20 1222 H 1
## 21 1738 E 1
## 22 1593 F 1
## 23 1447 G 1
## 24 1255 H 1
## 25 1635 F 1
## 26 1485 H 1
## 27 1420 F 1
## 28 1420 H 1
## 29 1911 F 1
## 30 1525 H 1
## 31 1956 F 1
## 32 1747 H 1
## 33 1572 I 1
## 34 2942 E 1
## 35 2532 G 1
## 36 3501 E 1
## 37 3501 E 1
## 38 3501 F 1
## 39 3293 F 1
## 40 3016 G 1
## 41 3567 F 1
## 42 3205 G 1
## 43 3490 D 1
## 44 3635 E 1
## 45 3635 F 1
## 46 3418 F 1
## 47 3921 D 1
## 48 3701 F 1
## 49 3480 F 1
## 50 3407 G 1
## 51 3767 E 1
## 52 4066 F 1
## 53 4138 E 1
## 54 3605 F 1
## 55 3529 G 1
## 56 3667 F 1
## 57 2892 I 1
## 58 3651 G 1
## 59 3773 G 1
## 60 4291 F 1
## 61 5845 E 1
## 62 4401 G 1
## 63 4759 G 1
## 64 4300 H 1
## 65 5510 F 1
## 66 5122 G 1
## 67 5122 H 1
## 68 3861 I 1
## 69 5881 F 1
## 70 5586 F 1
## 71 5193 F 1
## 72 5193 H 1
## 73 5263 F 1
## 74 5441 I 1
## 75 4948 I 1
## 76 5705 H 1
## 77 6805 F 1
## 78 6882 H 1
## 79 6709 H 1
## 80 6682 I 1
## 81 3501 E 1
## 82 3432 G 1
## 83 3851 F 1
## 84 3605 H 1
## 85 3900 E 1
## 86 3415 H 1
## 87 4291 H 1
## 88 6512 E 1
## 89 5800 E 1
## 90 6285 F 1
## 91 5122 F 1
## 92 5122 F 1
## 93 5122 G 1
## 94 5122 H 1
## 95 6372 D 1
## 96 5881 E 1
## 97 5193 H 1
## 98 5961 E 1
## 99 5662 H 1
## 100 5738 E 1
## 101 5030 H 1
## 102 5030 H 1
## 103 4727 I 1
## 104 4221 I 1
## 105 5815 G 1
## 106 4585 H 1
## 107 7368 D 1
## 108 4667 I 1
## 109 4355 I 1
## 110 9885 D 1
## 111 6919 F 1
## 112 5386 H 1
## 113 4832 I 1
## 114 7156 E 1
## 115 7680 F 1
## 116 15582 D 1
## 117 11419 D 1
## 118 10588 E 1
## 119 9757 E 1
## 120 13913 F 1
## 121 10588 F 1
## 122 10713 F 1
## 123 9480 F 1
## 124 9896 G 1
## 125 9619 G 1
## 126 9169 G 1
## 127 9203 G 1
## 128 8788 H 1
## 129 8095 I 1
## 130 7818 I 1
## 131 16008 D 1
## 132 10692 E 1
## 133 9853 E 1
## 134 10272 F 1
## 135 9573 F 1
## 136 9153 H 1
## 137 8873 H 1
## 138 8873 I 1
## 139 8455 I 1
## 140 7895 I 1
## 141 10372 F 1
## 142 9666 F 1
## 143 10090 G 1
## 144 10900 E 1
## 145 10571 F 1
## 146 9563 I 1
## 147 8781 I 1
## 148 9743 G 1
## 149 9302 H 1
## 150 8945 I 1
## 151 9646 H 1
## 152 823 F 2
## 153 765 F 2
## 154 803 G 2
## 155 803 G 2
## 156 705 G 2
## 157 725 H 2
## 158 967 D 2
## 159 1050 E 2
## 160 967 F 2
## 161 863 F 2
## 162 800 F 2
## 163 842 G 2
## 164 800 G 2
## 165 758 H 2
## 166 880 D 2
## 167 880 G 2
## 168 705 G 2
## 169 638 G 2
## 170 919 D 2
## 171 1149 E 2
## 172 1057 F 2
## 173 919 G 2
## 174 1198 E 2
## 175 1248 E 2
## 176 1147 F 2
## 177 995 G 2
## 178 1108 H 2
## 179 1485 F 2
## 180 1283 G 2
## 181 1149 H 2
## 182 1082 I 2
## 183 1539 F 2
## 184 1365 F 2
## 185 1260 F 2
## 186 1121 I 2
## 187 1595 F 2
## 188 1233 H 2
## 189 1199 I 2
## 190 1471 G 2
## 191 1238 I 2
## 192 1580 E 2
## 193 1459 F 2
## 194 1459 G 2
## 195 1218 H 2
## 196 1299 I 2
## 197 1628 E 2
## 198 1628 F 2
## 199 1337 I 2
## 200 1462 H 2
## 201 1503 H 2
## 202 1773 F 2
## 203 1636 F 2
## 204 1821 F 2
## 205 1540 G 2
## 206 2276 G 2
## 207 1616 I 2
## 208 1506 I 2
## 209 2651 F 2
## 210 2383 F 2
## 211 3652 G 2
## 212 3722 E 2
## 213 3722 F 2
## 214 3095 I 2
## 215 3706 F 2
## 216 4070 E 2
## 217 3470 G 2
## 218 4831 E 2
## 219 4209 F 2
## 220 3821 G 2
## 221 5607 G 2
## 222 5326 G 2
## 223 6160 D 2
## 224 6095 F 2
## 225 5937 G 2
## 226 9342 H 2
## 227 9713 G 2
## 228 8873 H 2
## 229 8175 I 2
## 230 3778 F 3
## 231 3432 G 3
## 232 3851 F 3
## 233 3346 E 3
## 234 3130 H 3
## 235 3995 F 3
## 236 3701 F 3
## 237 3529 G 3
## 238 3667 F 3
## 239 3202 F 3
## 240 3256 F 3
## 241 3415 H 3
## 242 3792 H 3
## 243 3925 G 3
## 244 3421 G 3
## 245 3925 H 3
## 246 3616 H 3
## 247 3615 I 3
## 248 3785 H 3
## 249 3643 I 3
## 250 4300 H 3
## 251 6867 E 3
## 252 6285 E 3
## 253 5800 G 3
## 254 5510 G 3
## 255 4346 H 3
## 256 6372 G 3
## 257 5193 H 3
## 258 5662 H 3
## 259 5333 F 3
## 260 6041 G 3
## 261 5815 H 3
## 262 8611 F 3
## 263 6905 F 3
## 264 6905 G 3
## 265 6416 H 3
## 266 6051 H 3
## 267 8715 E 3
## 268 6988 E 3
## 269 6988 F 3
## 270 6495 G 3
## 271 7358 H 3
## 272 6572 F 3
## 273 7072 G 3
## 274 8359 F 3
## 275 6805 F 3
## 276 7711 G 3
## 277 5835 H 3
## 278 13775 D 3
## 279 14051 E 3
## 280 11419 E 3
## 281 10588 E 3
## 282 11696 F 3
## 283 10588 F 3
## 284 10450 G 3
## 285 9896 G 3
## 286 9203 G 3
## 287 9480 H 3
## 288 9065 H 3
## 289 8788 H 3
## 290 8788 I 3
## 291 8372 I 3
## 292 8095 I 3
## 293 7818 I 3
## 294 13909 D 3
## 295 11531 E 3
## 296 10692 E 3
## 297 11811 F 3
## 298 10272 F 3
## 299 9993 G 3
## 300 9293 G 3
## 301 9433 H 3
## 302 9153 H 3
## 303 8873 I 3
## 304 8175 I 3
## 305 10796 F 3
## 306 9890 H 3
## 307 8959 H 3
## 308 9107 I 3
Before explaining more in detail, considering results from G*Power, and extending the Null Hypothesis Significance Test (NHST) to two alternatives, I want to outline some key portions from the Recipes for Design of Experiments.
The original dataset for the diamonds case included several factors and response variables. However, the factors were chosen based primarily on the rules of ANOVA and block design, and personal preference with Best Guess in regards to predicting the effect. I needed to have independent variables (IV) that were categorical and dependent variables (DV) that were continuous.
The block design was placed onto one of the pair of the categorical IVs. Based on Best Guess, I blocked the certification because typically the pricing of a diamond depends more on the weight and color, than certification.
#Check the structure of the organized data
str(data)
## 'data.frame': 308 obs. of 3 variables:
## $ price : int 1302 1510 1510 1260 1641 1555 1427 1427 1126 1126 ...
## $ treatment: Factor w/ 6 levels "D","E","F","G",..: 1 2 4 4 1 2 3 4 5 6 ...
## $ block : int 1 1 1 1 1 1 1 1 1 1 ...
data$block = factor(data$block)
data$block
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [36] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [71] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [106] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [141] 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [176] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [211] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [246] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [281] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## Levels: 1 2 3
In this experiment, the null hypothesis states that there is no difference between the color of the diamond (D,E,F,G,H,I) in regards to the price of the diamonds.
On the contrary, the alternative hypothesis states that there is a difference between the color of dimaond and the price of the diamonds.
For the purpose of this analyis, I chose a: Type I Error (alpha) of 0.05 because it is the most commonly used in practice for significance. Type II error (beta) of 0.1 because from the discussions in class, we learned that a power (1-beta) of 0.8 or greater is preferred. An increase in power means that there needs to be more sample size as well.
The rationale for this design was limited as I adapted a pre-existing experiment design dataset. The original factors and procedures to collect the data were done by the experimentor. Therefore, I had less flexibility and control over some aspects
However, the rationale for the block design was purely a simple case of a blocked design and study the results by performing ANOVA for the particular factor.
First of all, I assume the data collected was random because the diamond stores were chosen randomly and at a random time frame of the year 2000, including day and time.
Also, to ensure further randomization, I randomized the order of the data:
#Randomize the data
index = sample(1:nrow(diamond), 308, replace=FALSE)
rdata = diamond[index,]
rdata
## price color certification
## 48 3701 F GIA
## 281 10588 E HRD
## 80 6682 I GIA
## 114 7156 E GIA
## 222 5326 G IGI
## 142 9666 F GIA
## 151 9646 H GIA
## 113 4832 I GIA
## 276 7711 G HRD
## 139 8455 I GIA
## 301 9433 H HRD
## 160 967 F IGI
## 94 5122 H GIA
## 1 1302 D GIA
## 187 1595 F IGI
## 63 4759 G GIA
## 225 5937 G IGI
## 231 3432 G HRD
## 167 880 G IGI
## 42 3205 G GIA
## 302 9153 H HRD
## 209 2651 F IGI
## 172 1057 F IGI
## 260 6041 G HRD
## 292 8095 I HRD
## 271 7358 H HRD
## 81 3501 E GIA
## 152 823 F IGI
## 107 7368 D GIA
## 144 10900 E GIA
## 185 1260 F IGI
## 197 1628 E IGI
## 133 9853 E GIA
## 149 9302 H GIA
## 120 13913 F GIA
## 228 8873 H IGI
## 41 3567 F GIA
## 236 3701 F HRD
## 150 8945 I GIA
## 305 10796 F HRD
## 135 9573 F GIA
## 307 8959 H HRD
## 131 16008 D GIA
## 304 8175 I HRD
## 143 10090 G GIA
## 123 9480 F GIA
## 166 880 D IGI
## 204 1821 F IGI
## 294 13909 D HRD
## 46 3418 F GIA
## 99 5662 H GIA
## 16 1551 F GIA
## 148 9743 G GIA
## 262 8611 F HRD
## 208 1506 I IGI
## 191 1238 I IGI
## 6 1555 E GIA
## 219 4209 F IGI
## 193 1459 F IGI
## 65 5510 F GIA
## 134 10272 F GIA
## 43 3490 D GIA
## 272 6572 F HRD
## 214 3095 I IGI
## 61 5845 E GIA
## 53 4138 E GIA
## 289 8788 H HRD
## 252 6285 E HRD
## 47 3921 D GIA
## 20 1222 H GIA
## 207 1616 I IGI
## 239 3202 F HRD
## 49 3480 F GIA
## 170 919 D IGI
## 175 1248 E IGI
## 211 3652 G IGI
## 26 1485 H GIA
## 183 1539 F IGI
## 162 800 F IGI
## 217 3470 G IGI
## 110 9885 D GIA
## 128 8788 H GIA
## 256 6372 G HRD
## 74 5441 I GIA
## 161 863 F IGI
## 186 1121 I IGI
## 24 1255 H GIA
## 50 3407 G GIA
## 59 3773 G GIA
## 44 3635 E GIA
## 177 995 G IGI
## 287 9480 H HRD
## 51 3767 E GIA
## 169 638 G IGI
## 105 5815 G GIA
## 3 1510 G GIA
## 178 1108 H IGI
## 83 3851 F GIA
## 297 11811 F HRD
## 240 3256 F HRD
## 245 3925 H HRD
## 182 1082 I IGI
## 92 5122 F GIA
## 251 6867 E HRD
## 103 4727 I GIA
## 159 1050 E IGI
## 188 1233 H IGI
## 52 4066 F GIA
## 299 9993 G HRD
## 176 1147 F IGI
## 71 5193 F GIA
## 196 1299 I IGI
## 101 5030 H GIA
## 19 1316 H GIA
## 274 8359 F HRD
## 249 3643 I HRD
## 70 5586 F GIA
## 153 765 F IGI
## 86 3415 H GIA
## 288 9065 H HRD
## 33 1572 I GIA
## 205 1540 G IGI
## 200 1462 H IGI
## 104 4221 I GIA
## 298 10272 F HRD
## 224 6095 F IGI
## 95 6372 D GIA
## 227 9713 G IGI
## 221 5607 G IGI
## 98 5961 E GIA
## 112 5386 H GIA
## 31 1956 F GIA
## 132 10692 E GIA
## 263 6905 F HRD
## 244 3421 G HRD
## 79 6709 H GIA
## 215 3706 F IGI
## 67 5122 H GIA
## 229 8175 I IGI
## 243 3925 G HRD
## 181 1149 H IGI
## 34 2942 E GIA
## 12 1202 G GIA
## 32 1747 H GIA
## 39 3293 F GIA
## 155 803 G IGI
## 158 967 D IGI
## 27 1420 F GIA
## 122 10713 F GIA
## 189 1199 I IGI
## 118 10588 E GIA
## 195 1218 H IGI
## 78 6882 H GIA
## 198 1628 F IGI
## 147 8781 I GIA
## 282 11696 F HRD
## 259 5333 F HRD
## 174 1198 E IGI
## 111 6919 F GIA
## 125 9619 G GIA
## 266 6051 H HRD
## 23 1447 G GIA
## 199 1337 I IGI
## 201 1503 H IGI
## 68 3861 I GIA
## 156 705 G IGI
## 87 4291 H GIA
## 277 5835 H HRD
## 141 10372 F GIA
## 88 6512 E GIA
## 194 1459 G IGI
## 38 3501 F GIA
## 165 758 H IGI
## 25 1635 F GIA
## 137 8873 H GIA
## 58 3651 G GIA
## 126 9169 G GIA
## 206 2276 G IGI
## 72 5193 H GIA
## 202 1773 F IGI
## 37 3501 E GIA
## 223 6160 D IGI
## 164 800 G IGI
## 100 5738 E GIA
## 108 4667 I GIA
## 308 9107 I HRD
## 296 10692 E HRD
## 300 9293 G HRD
## 220 3821 G IGI
## 116 15582 D GIA
## 233 3346 E HRD
## 238 3667 F HRD
## 241 3415 H HRD
## 64 4300 H GIA
## 279 14051 E HRD
## 234 3130 H HRD
## 230 3778 F HRD
## 8 1427 G GIA
## 168 705 G IGI
## 29 1911 F GIA
## 173 919 G IGI
## 212 3722 E IGI
## 76 5705 H GIA
## 62 4401 G GIA
## 69 5881 F GIA
## 121 10588 F GIA
## 91 5122 F GIA
## 216 4070 E IGI
## 273 7072 G HRD
## 130 7818 I GIA
## 10 1126 I GIA
## 242 3792 H HRD
## 280 11419 E HRD
## 4 1260 G GIA
## 30 1525 H GIA
## 75 4948 I GIA
## 93 5122 G GIA
## 11 1468 F GIA
## 210 2383 F IGI
## 303 8873 I HRD
## 213 3722 F IGI
## 60 4291 F GIA
## 269 6988 F HRD
## 261 5815 H HRD
## 97 5193 H GIA
## 278 13775 D HRD
## 192 1580 E IGI
## 284 10450 G HRD
## 124 9896 G GIA
## 190 1471 G IGI
## 270 6495 G HRD
## 283 10588 F HRD
## 264 6905 G HRD
## 291 8372 I HRD
## 2 1510 E GIA
## 56 3667 F GIA
## 9 1126 H GIA
## 203 1636 F IGI
## 15 1693 E GIA
## 180 1283 G IGI
## 73 5263 F GIA
## 250 4300 H HRD
## 35 2532 G GIA
## 184 1365 F IGI
## 235 3995 F HRD
## 140 7895 I GIA
## 7 1427 F GIA
## 275 6805 F HRD
## 257 5193 H HRD
## 77 6805 F GIA
## 84 3605 H GIA
## 17 1410 G GIA
## 21 1738 E GIA
## 237 3529 G HRD
## 57 2892 I GIA
## 163 842 G IGI
## 102 5030 H GIA
## 267 8715 E HRD
## 14 1098 I GIA
## 246 3616 H HRD
## 218 4831 E IGI
## 255 4346 H HRD
## 157 725 H IGI
## 89 5800 E GIA
## 45 3635 F GIA
## 290 8788 I HRD
## 293 7818 I HRD
## 226 9342 H IGI
## 171 1149 E IGI
## 265 6416 H HRD
## 40 3016 G GIA
## 254 5510 G HRD
## 36 3501 E GIA
## 82 3432 G GIA
## 119 9757 E GIA
## 85 3900 E GIA
## 109 4355 I GIA
## 127 9203 G GIA
## 258 5662 H HRD
## 55 3529 G GIA
## 90 6285 F GIA
## 138 8873 I GIA
## 115 7680 F GIA
## 179 1485 F IGI
## 18 1269 G GIA
## 66 5122 G GIA
## 145 10571 F GIA
## 154 803 G IGI
## 253 5800 G HRD
## 286 9203 G HRD
## 13 1327 E GIA
## 248 3785 H HRD
## 106 4585 H GIA
## 5 1641 D GIA
## 117 11419 D GIA
## 146 9563 I GIA
## 136 9153 H GIA
## 129 8095 I GIA
## 295 11531 E HRD
## 22 1593 F GIA
## 96 5881 E GIA
## 268 6988 E HRD
## 285 9896 G HRD
## 247 3615 I HRD
## 28 1420 H GIA
## 54 3605 F GIA
## 232 3851 F HRD
## 306 9890 H HRD
Therefore, there were 3 levels of randomization: measurement, assignment, and performing.
Due to the simplification of the dataset, which was a cross-section during 2000, I cannot verify if there were any replicates and/or repeated measures.
As mentioned previously, the design definitely includes blocking. The IV factor (certification) was blocked to reduce the nuisance factor in how the certification process may vary among the institutions due to differing perspectives and industry standards, while color of a diamond can be viewed more or less with less conflicts and consensus. An ANOVA test was examined, with the corresponding results for certification neglected.
Below is the descriptive summary of the data.
#Display the descriptive statistics
summary(diamond)
## price color certification
## Min. : 638 D:16 GIA:151
## 1st Qu.: 1625 E:44 HRD: 79
## Median : 4215 F:82 IGI: 78
## Mean : 5019 G:65
## 3rd Qu.: 7446 H:61
## Max. :16008 I:40
Also, to take a closer look at the general overview of the data, I plotted a histogram of the price of diamonds. The histogram seems to have a right-skew and mostly clustered within the approximate range of 0 and 5000.
#Histogram and Boxplot of Price
hist(diamond$price, main="Histogram of Diamond Prices")
boxplot(diamond$price, main="Boxplot of Diamond Prices")
You can see the different certifications labelled as G, I, and H.
#Plot for Certification
plot(diamond$price, pch=as.character(diamond$certification))
From this graphic, the IGI certification seemed to be not have as many expensive diamonds as the other certifications.
boxplot(diamond$price~diamond$certification, main = "Price vs Certification", xlab = "Price in Singapore $",ylab="Certification")
The me was calculated in a similar fashion as the previous project. Here, I considered the non-blocking IV, which was the color.
#Calculating the main effect to get the effect size (later confirm with G*Power)
#Take the difference between the means of the non-blocking IV (color)
m1 <- mean(subset(diamond$price, diamond$color=="D"))
m2 <- mean(subset(diamond$price, diamond$color=="E"))
m3 <- mean(subset(diamond$price, diamond$color=="F"))
m4 <- mean(subset(diamond$price, diamond$color=="G"))
m5 <- mean(subset(diamond$price, diamond$color=="H"))
m6 <- mean(subset(diamond$price, diamond$color=="I"))
me <- m6 - m5 - m4 - m3 - m2 - m1
#Print the result
me
## [1] -21610.39
me = -21610.39
There was a difference between the means. A boxplot below was plotted to elaborate on the me.
#Boxplots
boxplot(diamond$price~diamond$color, main = "Price vs Color", xlab = "Price in Singapore $",ylab="Color")
First of all, from the boxplot, there were no outliers present. Also, the median for D was higher than most of the other colors, followed by H,I,E,F, and then G. F and G were fairly close, but the variance or range differed.
I first found the effect size by hand using R, and then later confirmed it with G*Power. The equation to compute the effect size was straightforward because it was taking the differences of the averages and divided by the standard deviation.
m1 <- mean(diamond$color=="D")
m2 <- mean(diamond$color=="E")
m3 <- mean(diamond$color=="F")
m4 <- mean(diamond$color=="G")
m5 <- mean(diamond$color=="H")
m6 <- mean(diamond$color=="I")
diff_mean = m6 - m5 - m4 - m3 -m2 -m1
sd1 <- sd(diamond$color=="D")
sd2 <- sd(diamond$color=="E")
sd3 <- sd(diamond$color=="F")
sd4 <- sd(diamond$color=="G")
sd5 <- sd(diamond$color=="H")
sd6 <- sd(diamond$color=="I")
sd = (sd1+sd2+sd3+sd4+sd5+sd6)/2
result = abs(diff_mean/sd)
result
## [1] 0.6853988
As mentioned before, Type I Error (alpha) = 0.05 Type II error (beta) = 0.1.
Effect size = 0.6853988
On G*Power, I kept the other conditions the same, except the ‘Type of power analysis’ to A Priori.
The following parameters were added into G*Power:
Effect Size = 0.6853988 Type I Error (alpha) = 0.05 Power = 0.9 Number of Groups = 3
Below is the screenshot:
Effect Size
From the G*Power output, I found that the critical F-statistics was 3.0253 and the sample size N needed to be 309. N=309 would be the amount needed to achieve the desired power.
Now that I got the required sample size needed, I decided to randomly select from the data.
#Set the right sample size
N = 33
set.seed(8)
#rdiamond = new diamond data
rdiamond = diamond[sample(nrow(diamond),N),]
First of all, I wanted to show the ANOVA result that was before the selected N experimental cases at random.
Remember: treatment: color block: certification (3 groups)
#Randomized Complete Block Design with ANOVA Analysis
fit.lm = lm(price ~ treatment+block, data=data)
anova(fit.lm)
## Analysis of Variance Table
##
## Response: price
## Df Sum Sq Mean Sq F value Pr(>F)
## treatment 5 108229779 21645956 2.638 0.02364 *
## block 2 985593642 492796821 60.058 < 2e-16 ***
## Residuals 300 2461603926 8205346
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(fit.lm)
##
## Call:
## lm(formula = price ~ treatment + block, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6286.7 -1604.7 -557.2 1675.7 8765.2
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7588.7 727.3 10.435 < 2e-16 ***
## treatmentE -1936.6 837.2 -2.313 0.021384 *
## treatmentF -2441.0 784.4 -3.112 0.002037 **
## treatmentG -2689.6 802.9 -3.350 0.000912 ***
## treatmentH -2873.3 809.5 -3.550 0.000448 ***
## treatmentI -2129.4 848.3 -2.510 0.012589 *
## block2 -2978.4 402.9 -7.392 1.45e-12 ***
## block3 2045.9 401.0 5.102 5.98e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2864 on 300 degrees of freedom
## Multiple R-squared: 0.3076, Adjusted R-squared: 0.2915
## F-statistic: 19.04 on 7 and 300 DF, p-value: < 2.2e-16
#Plots
plot(fit.lm)
interaction.plot(data$treatment, data$block, data$price)
Using the sample size, N=33, I used the ndiamond. The 2-way ANOVA test was then performed. Since the certification was blocked, I ignored the F statistics.
#ANOVA
anova = aov(rdiamond$price ~ rdiamond$color+rdiamond$certification)
summary(anova)
## Df Sum Sq Mean Sq F value Pr(>F)
## rdiamond$color 5 106615419 21323084 2.866 0.0352 *
## rdiamond$certification 2 52942085 26471043 3.558 0.0437 *
## Residuals 25 185998031 7439921
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The null hypothesis was that there is no difference between the color of the diamond in regards to the price of the diamonds. The results from the certification would be ignored because it was the blocking variable. From the ANOVA results, the F-statistic for the treatment (color) was 2.866 and the p-value was 0.0352. The alpha value was set to 0.05. The p-value was much less than the alpha value and the F value was not close to 0.
Therefore, since the p-value < alpha, we reject the null hypothesis, and the result was statistically significant. The certification effect was not the research question, so it was neglected.
Null Hypothesis Statistical Testing (NHST) was explored to practice the skills and apply them to the experiment.
Multiple models are used to minimize the importance of the need to reject the null hypothesis. It specifies how well the data fits the set of models. The data are not to be assumed that the they would be derived directly from the null hypothesis. In this case where the outcome was significant, I want to find a family of model that shows the opposite outcome, so insignificant. Since I had significance, I want to find insignificance. The purpose of multiple models is to show that the results are truly significant or insignificant (depending on your results) by showing a model or family of models for which I would not get the same significant results.
From the original model, I found that the results were significant.
I first looked at the interactions.
#Check Interaction
mm1 = aov(rdiamond$price ~ rdiamond$color*rdiamond$certification)
summary(mm1)
## Df Sum Sq Mean Sq F value Pr(>F)
## rdiamond$color 5 106615419 21323084 2.590 0.0580
## rdiamond$certification 2 52942085 26471043 3.215 0.0615
## rdiamond$color:rdiamond$certification 5 21341985 4268397 0.518 0.7593
## Residuals 20 164656046 8232802
##
## rdiamond$color .
## rdiamond$certification .
## rdiamond$color:rdiamond$certification
## Residuals
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The interactions showed some significance, but the interaction was fairly insignificant. This strengthens the assumption that there was no interaction between the blocking and response variable.
Then, I looked at the case, where I transformed the outcome of the Y by increasing the power to 2.
rdiamond$price = (rdiamond$price)^2
mm2= aov(rdiamond$price ~ rdiamond$color+rdiamond$certification)
summary(mm2)
## Df Sum Sq Mean Sq F value Pr(>F)
## rdiamond$color 5 1.219e+16 2.438e+15 2.117 0.0967 .
## rdiamond$certification 2 4.161e+15 2.081e+15 1.807 0.1850
## Residuals 25 2.879e+16 1.152e+15
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Here, I noticed that the results were insignificant.
I tried more cases. For this, I multiplied the coefficient 4.
rdiamond$price = (rdiamond$price)*4
mm3 = aov(rdiamond$price ~ rdiamond$color+rdiamond$certification)
summary(mm3)
## Df Sum Sq Mean Sq F value Pr(>F)
## rdiamond$color 5 1.950e+17 3.900e+16 2.117 0.0967 .
## rdiamond$certification 2 6.658e+16 3.329e+16 1.807 0.1850
## Residuals 25 4.607e+17 1.843e+16
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The results were insignificant.
I also looked into log.
N = 33
mdiamond = diamond[sample(nrow(diamond),N),]
mdiamond$price = log(mdiamond$price)
mm4 = aov(mdiamond$price ~ mdiamond$color+mdiamond$certification)
summary(mm4)
## Df Sum Sq Mean Sq F value Pr(>F)
## mdiamond$color 4 1.359 0.3398 0.740 0.5735
## mdiamond$certification 2 3.714 1.8572 4.043 0.0296 *
## Residuals 26 11.944 0.4594
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Interestingly, the results were insignificant, while the certification showed significance.
Next, I looked into squaring the certification.
mdiamond$certification2 = as.integer(mdiamond$certification)^2
mm5 = aov(mdiamond$price ~ mdiamond$color+mdiamond$certification2)
summary(mm5)
## Df Sum Sq Mean Sq F value Pr(>F)
## mdiamond$color 4 1.359 0.3398 0.586 0.675
## mdiamond$certification2 1 0.003 0.0030 0.005 0.943
## Residuals 27 15.655 0.5798
This made the results insignificant as well.
Finally, I considered other fixed effect models, where I added a new factor, clarity.
test = read.csv("nmodel.csv", header = TRUE, sep=",")
N = 33
nm = test[sample(nrow(test),N),]
mm6 = aov(test$price ~ test$clarity+test$certification)
summary(mm6)
## Df Sum Sq Mean Sq F value Pr(>F)
## test$clarity 4 2.997e+08 74917212 8.826 9.46e-07 ***
## test$certification 2 7.009e+08 350460667 41.290 < 2e-16 ***
## Residuals 301 2.555e+09 8487831
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
This time, the results were significant. The clarity seemed to have affected the price of diamond a lot. From an intuitive point of view, this makes sense.
Therefore, with multiple models I was able to show different family of models that predicted no difference, positive difference, and negative difference. In other words, I was able to show that other models can make the results insignificant, contrast to my significance.
The second NHST I considered was confidence intervals. Confidence intervals are a medium or tool to assess the probability that a parameter lies within a certain range of values. It is used to appraise results found. To interpret confidence intervals, I would use to see where the values fall under this particular range of values. It estimates the degree to which the values is most likely going to be the true value. I prefer confidence intervals over effect size because it can replace significance testing.
First, I used confint to see the confidence interval.
confint(anova)
## 2.5 % 97.5 %
## (Intercept) -4315.6425 6919.6425
## rdiamond$colorE 750.8011 12825.7181
## rdiamond$colorF -2575.6021 9585.2104
## rdiamond$colorG -2337.4214 10092.1483
## rdiamond$colorH -1273.2912 10817.0493
## rdiamond$colorI -1135.7277 13079.3409
## rdiamond$certificationHRD -2768.7008 2674.0127
## rdiamond$certificationIGI -6608.6126 -798.3339
Then, I used Tukey’s Method. Tukey’s Method creates confidence intervals for paired difference among factor levels and helps control the error rate, which I specified as 0.05.
tukey = TukeyHSD(anova)
plot(tukey)
TukeyHSD(anova)
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = rdiamond$price ~ rdiamond$color + rdiamond$certification)
##
## $`rdiamond$color`
## diff lwr upr p adj
## E-D 6252.4286 -2733.905 15238.7617 0.2981618
## F-D 1853.5556 -7007.088 10714.1995 0.9862680
## G-D 3127.2000 -6081.051 12335.4513 0.8973377
## H-D 4754.1250 -4161.726 13669.9760 0.5793071
## I-D 2268.3333 -7438.016 11974.6825 0.9776018
## F-E -4398.8730 -8635.071 -162.6749 0.0385353
## G-E -3125.2286 -8047.246 1796.7888 0.3935379
## H-E -1498.3036 -5848.793 2852.1863 0.8918532
## I-E -3984.0952 -9784.748 1816.5579 0.3111946
## G-F 1273.6444 -3414.968 5962.2565 0.9575113
## H-F 2900.5694 -1183.986 6985.1245 0.2779638
## I-F 414.7778 -5189.186 6018.7411 0.9999037
## H-G 1626.9250 -3165.201 6419.0509 0.8974604
## I-G -858.8667 -6997.701 5279.9676 0.9978711
## I-H -2485.7917 -8176.643 3205.0600 0.7571489
##
## $`rdiamond$certification`
## diff lwr upr p adj
## HRD-GIA -194.6715 -3397.414 3008.0710 0.987449
## IGI-GIA -2237.9506 -5011.607 535.7057 0.130649
## IGI-HRD -2043.2791 -5624.054 1537.4958 0.345518
The confidence interval for the certification contained zero. A zero in a confidence interval range means that there is no difference. In other words, the confidence interval that contained zero means that there is no statistical significance. For my case, there were inconsistencies and consistencies depending on the family model I looked with my ANOVA tests because certification was not significant in some.
I estimated the estimation of the parameters for the N=33.
fit.lm2 = lm(rdiamond$price ~ rdiamond$color+rdiamond$certification)
summary(fit.lm2)
##
## Call:
## lm(formula = rdiamond$price ~ rdiamond$color + rdiamond$certification)
##
## Residuals:
## Min 1Q Median 3Q Max
## -233227043 -87657323 -5532014 65925768 237594233
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6780816 135746198 0.050 0.9606
## rdiamond$colorE 275474231 145890743 1.888 0.0707 .
## rdiamond$colorF 116416918 146928544 0.792 0.4356
## rdiamond$colorG 135329147 150175704 0.901 0.3761
## rdiamond$colorH 174864663 146077092 1.197 0.2425
## rdiamond$colorI 219826615 171748338 1.280 0.2123
## rdiamond$certificationHRD -16358766 65759585 -0.249 0.8056
## rdiamond$certificationIGI -133199705 70200556 -1.897 0.0694 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 135700000 on 25 degrees of freedom
## Multiple R-squared: 0.3622, Adjusted R-squared: 0.1836
## F-statistic: 2.028 on 7 and 25 DF, p-value: 0.0913
Note: [Some outputs were deleted.]
coef(anova)
## (Intercept) rdiamond$colorE
## 1302.00000 6788.25961
## rdiamond$colorF rdiamond$colorG
## 3504.80412 3877.36346
## rdiamond$colorH rdiamond$colorI
## 4771.87901 5971.80659
## rdiamond$certificationHRD rdiamond$certificationIGI
## -47.34404 -3703.47326
To check if the normality assumption holds, I used the qqnorm and qqlines.
#Check for Normality
qqnorm(residuals(fit.lm2))
qqline(residuals(fit.lm2))
#Residual Plot
plot(fitted(anova),residuals(anova))
Based on the qqnorm and qqline, the qqnorm line looks fairly linear and closely fits the qqline drawn. Also, the residual plot showed an evenly distributed residual as a function of fitted values. Also, there was a good scatter and not close to 0. The pattern seemed random and evenly distributed, two important conditions that satisfy the normality check. Therefore, the normality assumption on the IV was satisfied because the residuals were evenly distributed and the qqnorm line is linear enough. The normality check passed. There is no need to do transformation because the normality assumption was passed.
Summary: Overall, the experiment finding was an interesting one. Based on the p-values, we learned that the color of the diamond does affect the price. The study focused on a blocked design of certification. Therefore, we rejected the null hypothesis. There was a difference between the color of the diamond in regards to the price of the diamonds. However, by exploring different family of models through multiple models, we were able to see changes in significance. In general, the color of the diamond seemed to have an effect on the price of the diamond.
In terms of the model, I would say this was a mixed effects model because there was randomization from the data collection and the variables were selected. However, some might argue it was a fixed effects model because the data might not have represented the general population.
For the findings, it was interesting to explore the multiple models.
The analysis made sense and the block design was well executed.
http://www.stat.purdue.edu/~xuanyaoh/stat350/xyMar26Lec23.pdf
http://people.stat.sfu.ca/~thompson/stat403-650/randomized_block.html
http://www.r-tutor.com/elementary-statistics/analysis-variance/randomized-block-design
https://vincentarelbundock.github.io/Rdatasets/doc/Ecdat/Diamond.html
https://statistics.laerd.com/spss-tutorials/multiple-regression-using-spss-statistics.php
#Benjamin Byeon
#10/30/16
#Section 1
#Load the data
diamond = read.csv("diamonds.csv", header = TRUE, sep=",")
diamond
#System under test
head(diamond)
tail(diamond)
names(diamond)
#Display the structure
str(diamond)
#Check the levels
levels(diamond$price)
levels(diamond$color)
levels(diamond$certification)
#Randomize the data
index = sample(1:nrow(diamond), 308, replace=FALSE)
rdata = diamond[index,]
rdata
#Display the descriptive statistics
summary(diamond)
#Histogram
hist(diamond$price, main="Histogram of Diamond Prices")
#Plot for certification
plot(diamond$price, pch=as.character(diamond$certification))
#Boxplots
boxplot(diamond$price~diamond$color, main = "Price vs Color", xlab = "Price in Singapore $",ylab="Color")
boxplot(diamond$price~diamond$certification, main = "Price vs Certification", xlab = "Price in Singapore $",ylab="Certification")
#Calculating the main effect to get the effect size (rather confirm with G*Power)
#Take the difference between the means of the non-blocking IV (color)
m1 <- mean(subset(diamond$price, diamond$color=="D"))
m2 <- mean(subset(diamond$price, diamond$color=="E"))
m3 <- mean(subset(diamond$price, diamond$color=="F"))
m4 <- mean(subset(diamond$price, diamond$color=="G"))
m5 <- mean(subset(diamond$price, diamond$color=="H"))
m6 <- mean(subset(diamond$price, diamond$color=="I"))
me <- m6 - m5 - m4 - m3 - m2 - m1
#Print the result
me
#Effect Size
m1 <- mean(diamond$color=="D")
m2 <- mean(diamond$color=="E")
m3 <- mean(diamond$color=="F")
m4 <- mean(diamond$color=="G")
m5 <- mean(diamond$color=="H")
m6 <- mean(diamond$color=="I")
diff_mean = m6 - m5 - m4 - m3 -m2 -m1
sd1 <- sd(diamond$color=="D")
sd2 <- sd(diamond$color=="E")
sd3 <- sd(diamond$color=="F")
sd4 <- sd(diamond$color=="G")
sd5 <- sd(diamond$color=="H")
sd6 <- sd(diamond$color=="I")
sd = (sd1+sd2+sd3+sd4+sd5+sd6)/2
result = abs(diff_mean/sd)
result
#Organize the data to treatments and block
data = read.csv("block.csv", header = TRUE, sep=",")
data
#Check the structure of the organized data
str(data)
data$block = factor(data$block)
data$block
#Randomized Complete Block Design with ANOVA Analysis
fit.lm = lm(price ~ treatment+block, data=data)
anova(fit.lm)
#Plots
plot(fit.lm)
interaction.plot(data$treatment, data$block, data$price)
#Set the right sample size
N = 33
set.seed(8)
#rdiamond = new diamond data
rdiamond = diamond[sample(nrow(diamond),N),]
#ANOVA
anova = aov(rdiamond$price ~ rdiamond$color+rdiamond$certification)
summary(anova)
#Multiple models
#Check Interaction
mm1 = aov(rdiamond$price ~ rdiamond$color*rdiamond$certification)
summary(mm1)
#Insignificant - original was significant
rdiamond$price = (rdiamond$price)^2
mm2= aov(rdiamond$price ~ rdiamond$color+rdiamond$certification)
summary(mm2)
#Insignificant
rdiamond$price = (rdiamond$price)*4
mm3 = aov(rdiamond$price ~ rdiamond$color+rdiamond$certification)
summary(mm3)
#Insignificant
N = 33
mdiamond = diamond[sample(nrow(diamond),N),]
mdiamond$price = log(mdiamond$price)
mm4 = aov(mdiamond$price ~ mdiamond$color+mdiamond$certification)
summary(mm4)
#Insignificant
mdiamond$certification2 = as.integer(mdiamond$certification)^2
mm5 = aov(mdiamond$price ~ mdiamond$color+mdiamond$certification2)
summary(mm5)
#Other fixed effect model - Significant
test = read.csv("nmodel.csv", header = TRUE, sep=",")
N = 33
nm = test[sample(nrow(test),N),]
mm6 = aov(test$price ~ test$clarity+test$certification)
summary(mm6)
#Confidence Interval
confint(anova1)
tukey = TukeyHSD(anova1)
plot(tukey)
TukeyHSD(anova)
#Estimation
fit.lm2 = lm(rdiamond$price ~ rdiamond$color+rdiamond$certification)
summary(fit.lm2)
coef(anova)
#Check for Normality
qqnorm(residuals(fit.lm2))
qqline(residuals(fit.lm2))
#Residual Plot
plot(fitted(anova),residuals(anova))