From the Ecdat Package, I selected a dataset that contained N=308 observations. The data looks into different factors that affect the pricing of the diamond carats. However, the study selected a pair of categorical independent variables and a continuous variable dependent variable. Also, the data was organized so that one independent variable was blocked. Further analysis were made regarding the study.
The data was copied and pasted into Excel, and then saved as a .csv. Shown below is the loaded data into R.
#Load the data
diamond = read.csv("diamonds.csv", header = TRUE, sep=",")
diamond
## price color certification
## 1 1302 D GIA
## 2 1510 E GIA
## 3 1510 G GIA
## 4 1260 G GIA
## 5 1641 D GIA
## 6 1555 E GIA
## 7 1427 F GIA
## 8 1427 G GIA
## 9 1126 H GIA
## 10 1126 I GIA
## 11 1468 F GIA
## 12 1202 G GIA
## 13 1327 E GIA
## 14 1098 I GIA
## 15 1693 E GIA
## 16 1551 F GIA
## 17 1410 G GIA
## 18 1269 G GIA
## 19 1316 H GIA
## 20 1222 H GIA
## 21 1738 E GIA
## 22 1593 F GIA
## 23 1447 G GIA
## 24 1255 H GIA
## 25 1635 F GIA
## 26 1485 H GIA
## 27 1420 F GIA
## 28 1420 H GIA
## 29 1911 F GIA
## 30 1525 H GIA
## 31 1956 F GIA
## 32 1747 H GIA
## 33 1572 I GIA
## 34 2942 E GIA
## 35 2532 G GIA
## 36 3501 E GIA
## 37 3501 E GIA
## 38 3501 F GIA
## 39 3293 F GIA
## 40 3016 G GIA
## 41 3567 F GIA
## 42 3205 G GIA
## 43 3490 D GIA
## 44 3635 E GIA
## 45 3635 F GIA
## 46 3418 F GIA
## 47 3921 D GIA
## 48 3701 F GIA
## 49 3480 F GIA
## 50 3407 G GIA
## 51 3767 E GIA
## 52 4066 F GIA
## 53 4138 E GIA
## 54 3605 F GIA
## 55 3529 G GIA
## 56 3667 F GIA
## 57 2892 I GIA
## 58 3651 G GIA
## 59 3773 G GIA
## 60 4291 F GIA
## 61 5845 E GIA
## 62 4401 G GIA
## 63 4759 G GIA
## 64 4300 H GIA
## 65 5510 F GIA
## 66 5122 G GIA
## 67 5122 H GIA
## 68 3861 I GIA
## 69 5881 F GIA
## 70 5586 F GIA
## 71 5193 F GIA
## 72 5193 H GIA
## 73 5263 F GIA
## 74 5441 I GIA
## 75 4948 I GIA
## 76 5705 H GIA
## 77 6805 F GIA
## 78 6882 H GIA
## 79 6709 H GIA
## 80 6682 I GIA
## 81 3501 E GIA
## 82 3432 G GIA
## 83 3851 F GIA
## 84 3605 H GIA
## 85 3900 E GIA
## 86 3415 H GIA
## 87 4291 H GIA
## 88 6512 E GIA
## 89 5800 E GIA
## 90 6285 F GIA
## 91 5122 F GIA
## 92 5122 F GIA
## 93 5122 G GIA
## 94 5122 H GIA
## 95 6372 D GIA
## 96 5881 E GIA
## 97 5193 H GIA
## 98 5961 E GIA
## 99 5662 H GIA
## 100 5738 E GIA
## 101 5030 H GIA
## 102 5030 H GIA
## 103 4727 I GIA
## 104 4221 I GIA
## 105 5815 G GIA
## 106 4585 H GIA
## 107 7368 D GIA
## 108 4667 I GIA
## 109 4355 I GIA
## 110 9885 D GIA
## 111 6919 F GIA
## 112 5386 H GIA
## 113 4832 I GIA
## 114 7156 E GIA
## 115 7680 F GIA
## 116 15582 D GIA
## 117 11419 D GIA
## 118 10588 E GIA
## 119 9757 E GIA
## 120 13913 F GIA
## 121 10588 F GIA
## 122 10713 F GIA
## 123 9480 F GIA
## 124 9896 G GIA
## 125 9619 G GIA
## 126 9169 G GIA
## 127 9203 G GIA
## 128 8788 H GIA
## 129 8095 I GIA
## 130 7818 I GIA
## 131 16008 D GIA
## 132 10692 E GIA
## 133 9853 E GIA
## 134 10272 F GIA
## 135 9573 F GIA
## 136 9153 H GIA
## 137 8873 H GIA
## 138 8873 I GIA
## 139 8455 I GIA
## 140 7895 I GIA
## 141 10372 F GIA
## 142 9666 F GIA
## 143 10090 G GIA
## 144 10900 E GIA
## 145 10571 F GIA
## 146 9563 I GIA
## 147 8781 I GIA
## 148 9743 G GIA
## 149 9302 H GIA
## 150 8945 I GIA
## 151 9646 H GIA
## 152 823 F IGI
## 153 765 F IGI
## 154 803 G IGI
## 155 803 G IGI
## 156 705 G IGI
## 157 725 H IGI
## 158 967 D IGI
## 159 1050 E IGI
## 160 967 F IGI
## 161 863 F IGI
## 162 800 F IGI
## 163 842 G IGI
## 164 800 G IGI
## 165 758 H IGI
## 166 880 D IGI
## 167 880 G IGI
## 168 705 G IGI
## 169 638 G IGI
## 170 919 D IGI
## 171 1149 E IGI
## 172 1057 F IGI
## 173 919 G IGI
## 174 1198 E IGI
## 175 1248 E IGI
## 176 1147 F IGI
## 177 995 G IGI
## 178 1108 H IGI
## 179 1485 F IGI
## 180 1283 G IGI
## 181 1149 H IGI
## 182 1082 I IGI
## 183 1539 F IGI
## 184 1365 F IGI
## 185 1260 F IGI
## 186 1121 I IGI
## 187 1595 F IGI
## 188 1233 H IGI
## 189 1199 I IGI
## 190 1471 G IGI
## 191 1238 I IGI
## 192 1580 E IGI
## 193 1459 F IGI
## 194 1459 G IGI
## 195 1218 H IGI
## 196 1299 I IGI
## 197 1628 E IGI
## 198 1628 F IGI
## 199 1337 I IGI
## 200 1462 H IGI
## 201 1503 H IGI
## 202 1773 F IGI
## 203 1636 F IGI
## 204 1821 F IGI
## 205 1540 G IGI
## 206 2276 G IGI
## 207 1616 I IGI
## 208 1506 I IGI
## 209 2651 F IGI
## 210 2383 F IGI
## 211 3652 G IGI
## 212 3722 E IGI
## 213 3722 F IGI
## 214 3095 I IGI
## 215 3706 F IGI
## 216 4070 E IGI
## 217 3470 G IGI
## 218 4831 E IGI
## 219 4209 F IGI
## 220 3821 G IGI
## 221 5607 G IGI
## 222 5326 G IGI
## 223 6160 D IGI
## 224 6095 F IGI
## 225 5937 G IGI
## 226 9342 H IGI
## 227 9713 G IGI
## 228 8873 H IGI
## 229 8175 I IGI
## 230 3778 F HRD
## 231 3432 G HRD
## 232 3851 F HRD
## 233 3346 E HRD
## 234 3130 H HRD
## 235 3995 F HRD
## 236 3701 F HRD
## 237 3529 G HRD
## 238 3667 F HRD
## 239 3202 F HRD
## 240 3256 F HRD
## 241 3415 H HRD
## 242 3792 H HRD
## 243 3925 G HRD
## 244 3421 G HRD
## 245 3925 H HRD
## 246 3616 H HRD
## 247 3615 I HRD
## 248 3785 H HRD
## 249 3643 I HRD
## 250 4300 H HRD
## 251 6867 E HRD
## 252 6285 E HRD
## 253 5800 G HRD
## 254 5510 G HRD
## 255 4346 H HRD
## 256 6372 G HRD
## 257 5193 H HRD
## 258 5662 H HRD
## 259 5333 F HRD
## 260 6041 G HRD
## 261 5815 H HRD
## 262 8611 F HRD
## 263 6905 F HRD
## 264 6905 G HRD
## 265 6416 H HRD
## 266 6051 H HRD
## 267 8715 E HRD
## 268 6988 E HRD
## 269 6988 F HRD
## 270 6495 G HRD
## 271 7358 H HRD
## 272 6572 F HRD
## 273 7072 G HRD
## 274 8359 F HRD
## 275 6805 F HRD
## 276 7711 G HRD
## 277 5835 H HRD
## 278 13775 D HRD
## 279 14051 E HRD
## 280 11419 E HRD
## 281 10588 E HRD
## 282 11696 F HRD
## 283 10588 F HRD
## 284 10450 G HRD
## 285 9896 G HRD
## 286 9203 G HRD
## 287 9480 H HRD
## 288 9065 H HRD
## 289 8788 H HRD
## 290 8788 I HRD
## 291 8372 I HRD
## 292 8095 I HRD
## 293 7818 I HRD
## 294 13909 D HRD
## 295 11531 E HRD
## 296 10692 E HRD
## 297 11811 F HRD
## 298 10272 F HRD
## 299 9993 G HRD
## 300 9293 G HRD
## 301 9433 H HRD
## 302 9153 H HRD
## 303 8873 I HRD
## 304 8175 I HRD
## 305 10796 F HRD
## 306 9890 H HRD
## 307 8959 H HRD
## 308 9107 I HRD
The dataset helps determine the effects of color and certification to the pricing of the diamond. Initially, the dataset consisted of many factors. However, I chose two independent variables that were of interest based purely on my preference.
I selected the color and certification to examine their effects. In addition, I chose which indepdendent variable to block. I blocked the variable that seemed to have less of an effect on price. Therefore, I decided to block certification because I thought certification had to do more with the institution certifying the diamond and less with the value of the diamond itself, like the color of the diamon. I was interested in studying a Randomized Complete Block Design.
Below are the definitions of the variables:
price: the price of the diamond in Singapore $
color: the color of the diamond with a factor of levels (D,E,F,G,H,I)
certification: the certification body with a factor of levels (GIA,IGI,HRD)
#Display the descriptive statistics
summary(diamond)
## price color certification
## Min. : 638 D:16 GIA:151
## 1st Qu.: 1625 E:44 HRD: 79
## Median : 4215 F:82 IGI: 78
## Mean : 5019 G:65
## 3rd Qu.: 7446 H:61
## Max. :16008 I:40
#System under test
head(diamond)
## price color certification
## 1 1302 D GIA
## 2 1510 E GIA
## 3 1510 G GIA
## 4 1260 G GIA
## 5 1641 D GIA
## 6 1555 E GIA
names(diamond)
## [1] "price" "color" "certification"
The data contains 3 factors and several levels. The as.factors was not necessary because they were categorical.
#Display the structure
str(diamond)
## 'data.frame': 308 obs. of 3 variables:
## $ price : int 1302 1510 1510 1260 1641 1555 1427 1427 1126 1126 ...
## $ color : Factor w/ 6 levels "D","E","F","G",..: 1 2 4 4 1 2 3 4 5 6 ...
## $ certification: Factor w/ 3 levels "GIA","HRD","IGI": 1 1 1 1 1 1 1 1 1 1 ...
#Check the levels
levels(diamond$price)
## NULL
levels(diamond$color)
## [1] "D" "E" "F" "G" "H" "I"
levels(diamond$certification)
## [1] "GIA" "HRD" "IGI"
To summarize, the levels are:
color: D,E,F,G,H,I
certification: GIA,IGI,HRD
The data had one continuous variable that was also the dependent variable. It was the price.
The main objective of the experiment was to determine the effects of pricing on diamond. Therefore, the price variable also represents the response variable as well.
The data consists of N=308 obversations with 3 variables. The data was collected in the year 2000 and the country the data came from was Singapore.
Here are the organizations and views of the data:
summary(diamond)
## price color certification
## Min. : 638 D:16 GIA:151
## 1st Qu.: 1625 E:44 HRD: 79
## Median : 4215 F:82 IGI: 78
## Mean : 5019 G:65
## 3rd Qu.: 7446 H:61
## Max. :16008 I:40
head(diamond)
## price color certification
## 1 1302 D GIA
## 2 1510 E GIA
## 3 1510 G GIA
## 4 1260 G GIA
## 5 1641 D GIA
## 6 1555 E GIA
tail(diamond)
## price color certification
## 303 8873 I HRD
## 304 8175 I HRD
## 305 10796 F HRD
## 306 9890 H HRD
## 307 8959 H HRD
## 308 9107 I HRD
Next, let’s look at the block design.
To consider for blocking, I needed to organize the data so that I could block on one independent variable and test a hypothesis on the second independent variable. Therefore, I chose certification as the variable to block. The color was the treatment. Take a closer look at the re-organized dataset:
#Organize the data to treatments and block
data = read.csv("block.csv", header = TRUE, sep=",")
data
## price treatment block
## 1 1302 D 1
## 2 1510 E 1
## 3 1510 G 1
## 4 1260 G 1
## 5 1641 D 1
## 6 1555 E 1
## 7 1427 F 1
## 8 1427 G 1
## 9 1126 H 1
## 10 1126 I 1
## 11 1468 F 1
## 12 1202 G 1
## 13 1327 E 1
## 14 1098 I 1
## 15 1693 E 1
## 16 1551 F 1
## 17 1410 G 1
## 18 1269 G 1
## 19 1316 H 1
## 20 1222 H 1
## 21 1738 E 1
## 22 1593 F 1
## 23 1447 G 1
## 24 1255 H 1
## 25 1635 F 1
## 26 1485 H 1
## 27 1420 F 1
## 28 1420 H 1
## 29 1911 F 1
## 30 1525 H 1
## 31 1956 F 1
## 32 1747 H 1
## 33 1572 I 1
## 34 2942 E 1
## 35 2532 G 1
## 36 3501 E 1
## 37 3501 E 1
## 38 3501 F 1
## 39 3293 F 1
## 40 3016 G 1
## 41 3567 F 1
## 42 3205 G 1
## 43 3490 D 1
## 44 3635 E 1
## 45 3635 F 1
## 46 3418 F 1
## 47 3921 D 1
## 48 3701 F 1
## 49 3480 F 1
## 50 3407 G 1
## 51 3767 E 1
## 52 4066 F 1
## 53 4138 E 1
## 54 3605 F 1
## 55 3529 G 1
## 56 3667 F 1
## 57 2892 I 1
## 58 3651 G 1
## 59 3773 G 1
## 60 4291 F 1
## 61 5845 E 1
## 62 4401 G 1
## 63 4759 G 1
## 64 4300 H 1
## 65 5510 F 1
## 66 5122 G 1
## 67 5122 H 1
## 68 3861 I 1
## 69 5881 F 1
## 70 5586 F 1
## 71 5193 F 1
## 72 5193 H 1
## 73 5263 F 1
## 74 5441 I 1
## 75 4948 I 1
## 76 5705 H 1
## 77 6805 F 1
## 78 6882 H 1
## 79 6709 H 1
## 80 6682 I 1
## 81 3501 E 1
## 82 3432 G 1
## 83 3851 F 1
## 84 3605 H 1
## 85 3900 E 1
## 86 3415 H 1
## 87 4291 H 1
## 88 6512 E 1
## 89 5800 E 1
## 90 6285 F 1
## 91 5122 F 1
## 92 5122 F 1
## 93 5122 G 1
## 94 5122 H 1
## 95 6372 D 1
## 96 5881 E 1
## 97 5193 H 1
## 98 5961 E 1
## 99 5662 H 1
## 100 5738 E 1
## 101 5030 H 1
## 102 5030 H 1
## 103 4727 I 1
## 104 4221 I 1
## 105 5815 G 1
## 106 4585 H 1
## 107 7368 D 1
## 108 4667 I 1
## 109 4355 I 1
## 110 9885 D 1
## 111 6919 F 1
## 112 5386 H 1
## 113 4832 I 1
## 114 7156 E 1
## 115 7680 F 1
## 116 15582 D 1
## 117 11419 D 1
## 118 10588 E 1
## 119 9757 E 1
## 120 13913 F 1
## 121 10588 F 1
## 122 10713 F 1
## 123 9480 F 1
## 124 9896 G 1
## 125 9619 G 1
## 126 9169 G 1
## 127 9203 G 1
## 128 8788 H 1
## 129 8095 I 1
## 130 7818 I 1
## 131 16008 D 1
## 132 10692 E 1
## 133 9853 E 1
## 134 10272 F 1
## 135 9573 F 1
## 136 9153 H 1
## 137 8873 H 1
## 138 8873 I 1
## 139 8455 I 1
## 140 7895 I 1
## 141 10372 F 1
## 142 9666 F 1
## 143 10090 G 1
## 144 10900 E 1
## 145 10571 F 1
## 146 9563 I 1
## 147 8781 I 1
## 148 9743 G 1
## 149 9302 H 1
## 150 8945 I 1
## 151 9646 H 1
## 152 823 F 2
## 153 765 F 2
## 154 803 G 2
## 155 803 G 2
## 156 705 G 2
## 157 725 H 2
## 158 967 D 2
## 159 1050 E 2
## 160 967 F 2
## 161 863 F 2
## 162 800 F 2
## 163 842 G 2
## 164 800 G 2
## 165 758 H 2
## 166 880 D 2
## 167 880 G 2
## 168 705 G 2
## 169 638 G 2
## 170 919 D 2
## 171 1149 E 2
## 172 1057 F 2
## 173 919 G 2
## 174 1198 E 2
## 175 1248 E 2
## 176 1147 F 2
## 177 995 G 2
## 178 1108 H 2
## 179 1485 F 2
## 180 1283 G 2
## 181 1149 H 2
## 182 1082 I 2
## 183 1539 F 2
## 184 1365 F 2
## 185 1260 F 2
## 186 1121 I 2
## 187 1595 F 2
## 188 1233 H 2
## 189 1199 I 2
## 190 1471 G 2
## 191 1238 I 2
## 192 1580 E 2
## 193 1459 F 2
## 194 1459 G 2
## 195 1218 H 2
## 196 1299 I 2
## 197 1628 E 2
## 198 1628 F 2
## 199 1337 I 2
## 200 1462 H 2
## 201 1503 H 2
## 202 1773 F 2
## 203 1636 F 2
## 204 1821 F 2
## 205 1540 G 2
## 206 2276 G 2
## 207 1616 I 2
## 208 1506 I 2
## 209 2651 F 2
## 210 2383 F 2
## 211 3652 G 2
## 212 3722 E 2
## 213 3722 F 2
## 214 3095 I 2
## 215 3706 F 2
## 216 4070 E 2
## 217 3470 G 2
## 218 4831 E 2
## 219 4209 F 2
## 220 3821 G 2
## 221 5607 G 2
## 222 5326 G 2
## 223 6160 D 2
## 224 6095 F 2
## 225 5937 G 2
## 226 9342 H 2
## 227 9713 G 2
## 228 8873 H 2
## 229 8175 I 2
## 230 3778 F 3
## 231 3432 G 3
## 232 3851 F 3
## 233 3346 E 3
## 234 3130 H 3
## 235 3995 F 3
## 236 3701 F 3
## 237 3529 G 3
## 238 3667 F 3
## 239 3202 F 3
## 240 3256 F 3
## 241 3415 H 3
## 242 3792 H 3
## 243 3925 G 3
## 244 3421 G 3
## 245 3925 H 3
## 246 3616 H 3
## 247 3615 I 3
## 248 3785 H 3
## 249 3643 I 3
## 250 4300 H 3
## 251 6867 E 3
## 252 6285 E 3
## 253 5800 G 3
## 254 5510 G 3
## 255 4346 H 3
## 256 6372 G 3
## 257 5193 H 3
## 258 5662 H 3
## 259 5333 F 3
## 260 6041 G 3
## 261 5815 H 3
## 262 8611 F 3
## 263 6905 F 3
## 264 6905 G 3
## 265 6416 H 3
## 266 6051 H 3
## 267 8715 E 3
## 268 6988 E 3
## 269 6988 F 3
## 270 6495 G 3
## 271 7358 H 3
## 272 6572 F 3
## 273 7072 G 3
## 274 8359 F 3
## 275 6805 F 3
## 276 7711 G 3
## 277 5835 H 3
## 278 13775 D 3
## 279 14051 E 3
## 280 11419 E 3
## 281 10588 E 3
## 282 11696 F 3
## 283 10588 F 3
## 284 10450 G 3
## 285 9896 G 3
## 286 9203 G 3
## 287 9480 H 3
## 288 9065 H 3
## 289 8788 H 3
## 290 8788 I 3
## 291 8372 I 3
## 292 8095 I 3
## 293 7818 I 3
## 294 13909 D 3
## 295 11531 E 3
## 296 10692 E 3
## 297 11811 F 3
## 298 10272 F 3
## 299 9993 G 3
## 300 9293 G 3
## 301 9433 H 3
## 302 9153 H 3
## 303 8873 I 3
## 304 8175 I 3
## 305 10796 F 3
## 306 9890 H 3
## 307 8959 H 3
## 308 9107 I 3
Before explaining more in detail, considering results from G*Power, and extending the Null Hypothesis Significance Test (NHST) to two alternatives, I want to outline some key portions from the Recipes for Design of Experiments.
The original dataset for the diamonds case included several factors and response variables. However, the factors were chosen based primarily on the rules of ANOVA and block design, and personal preference with Best Guess in regards to predicting the effect. I needed to have independent variables (IV) that were categorical and dependent variables (DV) that were continuous.
The block design was placed onto one of the pair of the categorical IVs. Based on Best Guess, I blocked the certification because typically the pricing of a diamond depends more on the weight and color, than certification.
#Check the structure of the organized data
str(data)
## 'data.frame': 308 obs. of 3 variables:
## $ price : int 1302 1510 1510 1260 1641 1555 1427 1427 1126 1126 ...
## $ treatment: Factor w/ 6 levels "D","E","F","G",..: 1 2 4 4 1 2 3 4 5 6 ...
## $ block : int 1 1 1 1 1 1 1 1 1 1 ...
data$block = factor(data$block)
data$block
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [36] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [71] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [106] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [141] 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [176] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [211] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [246] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [281] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## Levels: 1 2 3
In this experiment, the null hypothesis states that there is no difference between the color of the diamond (D,E,F,G,H,I) in regards to the price of the diamonds.
On the contrary, the alternative hypothesis states that there is a difference between the color of dimaond and the price of the diamonds.
For the purpose of this analyis, I chose a: Type I Error (alpha) of 0.05 because it is the most commonly used in practice for significance. Type II error (beta) of 0.1 because from the discussions in class, we learned that a power (1-beta) of 0.8 or greater is preferred. An increase in power means that there needs to be more sample size as well.
The rationale for this design was limited as I adapted a pre-existing experiment design dataset. The original factors and procedures to collect the data were done by the experimentor. Therefore, I had less flexibility and control over some aspects
However, the rationale for the block design was purely a simple case of a blocked design and study the results by performing ANOVA for the particular factor.
First of all, I assume the data collected was random because the diamond stores were chosen randomly and at a random time frame of the year 2000, including day and time.
Also, to ensure further randomization, I randomized the order of the data:
#Randomize the data
index = sample(1:nrow(diamond), 308, replace=FALSE)
rdata = diamond[index,]
rdata
## price color certification
## 144 10900 E GIA
## 157 725 H IGI
## 226 9342 H IGI
## 112 5386 H GIA
## 255 4346 H HRD
## 126 9169 G GIA
## 24 1255 H GIA
## 178 1108 H IGI
## 210 2383 F IGI
## 58 3651 G GIA
## 91 5122 F GIA
## 196 1299 I IGI
## 85 3900 E GIA
## 235 3995 F HRD
## 56 3667 F GIA
## 117 11419 D GIA
## 151 9646 H GIA
## 82 3432 G GIA
## 142 9666 F GIA
## 281 10588 E HRD
## 197 1628 E IGI
## 167 880 G IGI
## 128 8788 H GIA
## 238 3667 F HRD
## 289 8788 H HRD
## 293 7818 I HRD
## 113 4832 I GIA
## 93 5122 G GIA
## 148 9743 G GIA
## 28 1420 H GIA
## 50 3407 G GIA
## 97 5193 H GIA
## 20 1222 H GIA
## 172 1057 F IGI
## 158 967 D IGI
## 79 6709 H GIA
## 290 8788 I HRD
## 15 1693 E GIA
## 43 3490 D GIA
## 250 4300 H HRD
## 287 9480 H HRD
## 297 11811 F HRD
## 184 1365 F IGI
## 103 4727 I GIA
## 285 9896 G HRD
## 12 1202 G GIA
## 66 5122 G GIA
## 217 3470 G IGI
## 173 919 G IGI
## 26 1485 H GIA
## 253 5800 G HRD
## 204 1821 F IGI
## 33 1572 I GIA
## 137 8873 H GIA
## 161 863 F IGI
## 198 1628 F IGI
## 55 3529 G GIA
## 99 5662 H GIA
## 233 3346 E HRD
## 116 15582 D GIA
## 268 6988 E HRD
## 132 10692 E GIA
## 7 1427 F GIA
## 259 5333 F HRD
## 286 9203 G HRD
## 1 1302 D GIA
## 94 5122 H GIA
## 102 5030 H GIA
## 256 6372 G HRD
## 8 1427 G GIA
## 276 7711 G HRD
## 9 1126 H GIA
## 215 3706 F IGI
## 189 1199 I IGI
## 232 3851 F HRD
## 164 800 G IGI
## 302 9153 H HRD
## 298 10272 F HRD
## 2 1510 E GIA
## 241 3415 H HRD
## 180 1283 G IGI
## 263 6905 F HRD
## 222 5326 G IGI
## 162 800 F IGI
## 25 1635 F GIA
## 65 5510 F GIA
## 170 919 D IGI
## 245 3925 H HRD
## 292 8095 I HRD
## 108 4667 I GIA
## 129 8095 I GIA
## 231 3432 G HRD
## 52 4066 F GIA
## 282 11696 F HRD
## 261 5815 H HRD
## 295 11531 E HRD
## 80 6682 I GIA
## 39 3293 F GIA
## 75 4948 I GIA
## 306 9890 H HRD
## 107 7368 D GIA
## 159 1050 E IGI
## 134 10272 F GIA
## 98 5961 E GIA
## 41 3567 F GIA
## 6 1555 E GIA
## 262 8611 F HRD
## 119 9757 E GIA
## 169 638 G IGI
## 283 10588 F HRD
## 212 3722 E IGI
## 60 4291 F GIA
## 22 1593 F GIA
## 269 6988 F HRD
## 51 3767 E GIA
## 45 3635 F GIA
## 18 1269 G GIA
## 81 3501 E GIA
## 224 6095 F IGI
## 130 7818 I GIA
## 274 8359 F HRD
## 141 10372 F GIA
## 181 1149 H IGI
## 192 1580 E IGI
## 218 4831 E IGI
## 70 5586 F GIA
## 208 1506 I IGI
## 106 4585 H GIA
## 86 3415 H GIA
## 280 11419 E HRD
## 63 4759 G GIA
## 234 3130 H HRD
## 69 5881 F GIA
## 115 7680 F GIA
## 296 10692 E HRD
## 152 823 F IGI
## 73 5263 F GIA
## 240 3256 F HRD
## 242 3792 H HRD
## 303 8873 I HRD
## 291 8372 I HRD
## 284 10450 G HRD
## 136 9153 H GIA
## 53 4138 E GIA
## 247 3615 I HRD
## 176 1147 F IGI
## 271 7358 H HRD
## 145 10571 F GIA
## 213 3722 F IGI
## 252 6285 E HRD
## 109 4355 I GIA
## 87 4291 H GIA
## 23 1447 G GIA
## 273 7072 G HRD
## 165 758 H IGI
## 223 6160 D IGI
## 225 5937 G IGI
## 48 3701 F GIA
## 34 2942 E GIA
## 125 9619 G GIA
## 308 9107 I HRD
## 171 1149 E IGI
## 277 5835 H HRD
## 191 1238 I IGI
## 78 6882 H GIA
## 13 1327 E GIA
## 14 1098 I GIA
## 209 2651 F IGI
## 155 803 G IGI
## 305 10796 F HRD
## 72 5193 H GIA
## 153 765 F IGI
## 207 1616 I IGI
## 54 3605 F GIA
## 40 3016 G GIA
## 183 1539 F IGI
## 71 5193 F GIA
## 288 9065 H HRD
## 138 8873 I GIA
## 4 1260 G GIA
## 100 5738 E GIA
## 270 6495 G HRD
## 304 8175 I HRD
## 3 1510 G GIA
## 68 3861 I GIA
## 214 3095 I IGI
## 32 1747 H GIA
## 67 5122 H GIA
## 61 5845 E GIA
## 46 3418 F GIA
## 123 9480 F GIA
## 143 10090 G GIA
## 11 1468 F GIA
## 246 3616 H HRD
## 278 13775 D HRD
## 30 1525 H GIA
## 205 1540 G IGI
## 301 9433 H HRD
## 154 803 G IGI
## 220 3821 G IGI
## 239 3202 F HRD
## 111 6919 F GIA
## 31 1956 F GIA
## 266 6051 H HRD
## 76 5705 H GIA
## 49 3480 F GIA
## 83 3851 F GIA
## 168 705 G IGI
## 47 3921 D GIA
## 182 1082 I IGI
## 228 8873 H IGI
## 140 7895 I GIA
## 17 1410 G GIA
## 96 5881 E GIA
## 202 1773 F IGI
## 237 3529 G HRD
## 62 4401 G GIA
## 200 1462 H IGI
## 267 8715 E HRD
## 124 9896 G GIA
## 147 8781 I GIA
## 272 6572 F HRD
## 279 14051 E HRD
## 38 3501 F GIA
## 227 9713 G IGI
## 27 1420 F GIA
## 84 3605 H GIA
## 265 6416 H HRD
## 236 3701 F HRD
## 149 9302 H GIA
## 185 1260 F IGI
## 135 9573 F GIA
## 199 1337 I IGI
## 127 9203 G GIA
## 201 1503 H IGI
## 160 967 F IGI
## 166 880 D IGI
## 163 842 G IGI
## 300 9293 G HRD
## 133 9853 E GIA
## 121 10588 F GIA
## 258 5662 H HRD
## 110 9885 D GIA
## 260 6041 G HRD
## 179 1485 F IGI
## 36 3501 E GIA
## 249 3643 I HRD
## 150 8945 I GIA
## 42 3205 G GIA
## 307 8959 H HRD
## 174 1198 E IGI
## 35 2532 G GIA
## 105 5815 G GIA
## 122 10713 F GIA
## 206 2276 G IGI
## 175 1248 E IGI
## 221 5607 G IGI
## 139 8455 I GIA
## 101 5030 H GIA
## 5 1641 D GIA
## 95 6372 D GIA
## 230 3778 F HRD
## 74 5441 I GIA
## 211 3652 G IGI
## 29 1911 F GIA
## 299 9993 G HRD
## 243 3925 G HRD
## 219 4209 F IGI
## 294 13909 D HRD
## 264 6905 G HRD
## 186 1121 I IGI
## 193 1459 F IGI
## 194 1459 G IGI
## 244 3421 G HRD
## 92 5122 F GIA
## 59 3773 G GIA
## 114 7156 E GIA
## 146 9563 I GIA
## 275 6805 F HRD
## 187 1595 F IGI
## 229 8175 I IGI
## 16 1551 F GIA
## 57 2892 I GIA
## 19 1316 H GIA
## 44 3635 E GIA
## 89 5800 E GIA
## 216 4070 E IGI
## 195 1218 H IGI
## 177 995 G IGI
## 254 5510 G HRD
## 131 16008 D GIA
## 77 6805 F GIA
## 190 1471 G IGI
## 118 10588 E GIA
## 10 1126 I GIA
## 188 1233 H IGI
## 21 1738 E GIA
## 203 1636 F IGI
## 37 3501 E GIA
## 64 4300 H GIA
## 248 3785 H HRD
## 90 6285 F GIA
## 156 705 G IGI
## 251 6867 E HRD
## 88 6512 E GIA
## 257 5193 H HRD
## 104 4221 I GIA
## 120 13913 F GIA
Therefore, there were 3 levels of randomization: measurement, assignment, and performing.
Due to the simplification of the dataset, which was a cross-section during 2000, I cannot verify if there were any replicates and/or repeated measures.
As mentioned previously, the design definitely includes blocking. The IV factor (certification) was blocked to reduce the nuisance factor in how the certification process may vary among the institutions due to differing perspectives and industry standards, while color of a diamond can be viewed more or less with less conflicts and consensus. An ANOVA test was examined, with the corresponding results for certification neglected.
Below is the descriptive summary of the data.
#Display the descriptive statistics
summary(diamond)
## price color certification
## Min. : 638 D:16 GIA:151
## 1st Qu.: 1625 E:44 HRD: 79
## Median : 4215 F:82 IGI: 78
## Mean : 5019 G:65
## 3rd Qu.: 7446 H:61
## Max. :16008 I:40
Also, to take a closer look at the general overview of the data, I plotted a histogram of the price of diamonds. The histogram seems to have a right-skew and mostly clustered within the approximate range of 0 and 5000.
#Histogram and Boxplot of Price
hist(diamond$price, main="Histogram of Diamond Prices")
boxplot(diamond$price, main="Boxplot of Diamond Prices")
You can see the different certifications labelled as G, I, and H.
#Plot for Certification
plot(diamond$price, pch=as.character(diamond$certification))
From this graphic, the IGI certification seemed to be not have as many expensive diamonds as the other certifications.
boxplot(diamond$price~diamond$certification, main = "Price vs Certification", xlab = "Price in Singapore $",ylab="Certification")
The me was calculated in a similar fashion as the previous project. Here, I considered the non-blocking IV, which was the color.
#Calculating the main effect to get the effect size (later confirm with G*Power)
#Take the difference between the means of the non-blocking IV (color)
m1 <- mean(subset(diamond$price, diamond$color=="D"))
m2 <- mean(subset(diamond$price, diamond$color=="E"))
m3 <- mean(subset(diamond$price, diamond$color=="F"))
m4 <- mean(subset(diamond$price, diamond$color=="G"))
m5 <- mean(subset(diamond$price, diamond$color=="H"))
m6 <- mean(subset(diamond$price, diamond$color=="I"))
me <- m6 - m5 - m4 - m3 - m2 - m1
#Print the result
me
## [1] -21610.39
me = -21610.39
There was a difference between the means. A boxplot below was plotted to elaborate on the me.
#Boxplots
boxplot(diamond$price~diamond$color, main = "Price vs Color", xlab = "Price in Singapore $",ylab="Color")
First of all, from the boxplot, there were no outliers present. Also, the median for D was higher than most of the other colors, followed by H,I,E,F, and then G. F and G were fairly close, but the variance or range differed.
I first found the effect size by hand using R, and then later confirmed it with G*Power. The equation to compute the effect size was straightforward because it was taking the differences of the averages and divided by the standard deviation.
m1 <- mean(diamond$color=="D")
m2 <- mean(diamond$color=="E")
m3 <- mean(diamond$color=="F")
m4 <- mean(diamond$color=="G")
m5 <- mean(diamond$color=="H")
m6 <- mean(diamond$color=="I")
diff_mean = m6 - m5 - m4 - m3 -m2 -m1
sd1 <- sd(diamond$color=="D")
sd2 <- sd(diamond$color=="E")
sd3 <- sd(diamond$color=="F")
sd4 <- sd(diamond$color=="G")
sd5 <- sd(diamond$color=="H")
sd6 <- sd(diamond$color=="I")
sd = (sd1+sd2+sd3+sd4+sd5+sd6)/2
result = abs(diff_mean/sd)
result
## [1] 0.6853988
As mentioned before, Type I Error (α) = 0.05 Type II error (β) = 0.1.
Effect size = 0.6853988
On G*Power, I kept the other conditions the same, except the ‘Type of power analysis’ to A Priori.
The following parameters were added into G*Power:
Effect Size = 0.6853988 Type I Error (α) = 0.05 Power = 0.9 Number of Groups = 3
Below is the screenshot:
Effect Size
From the G*Power output, I found that the critical F-statistics was 3.0253 and the sample size N needed to be 309. N=309 would be the amount needed to achieve the desired power.
Now that I got the required sample size needed, I decided to randomly select from the data.
#Set the right sample size
N = 33
set.seed(8)
#rdiamond = new diamond data
rdiamond = diamond[sample(nrow(diamond),N),]
First of all, I wanted to show the ANOVA result that was before the selected N experimental cases at random.
Remember: treatment: color block: certification (3 groups)
#Randomized Complete Block Design with ANOVA Analysis
fit.lm = lm(price ~ treatment+block, data=data)
anova(fit.lm)
## Analysis of Variance Table
##
## Response: price
## Df Sum Sq Mean Sq F value Pr(>F)
## treatment 5 108229779 21645956 2.638 0.02364 *
## block 2 985593642 492796821 60.058 < 2e-16 ***
## Residuals 300 2461603926 8205346
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(fit.lm)
##
## Call:
## lm(formula = price ~ treatment + block, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6286.7 -1604.7 -557.2 1675.7 8765.2
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7588.7 727.3 10.435 < 2e-16 ***
## treatmentE -1936.6 837.2 -2.313 0.021384 *
## treatmentF -2441.0 784.4 -3.112 0.002037 **
## treatmentG -2689.6 802.9 -3.350 0.000912 ***
## treatmentH -2873.3 809.5 -3.550 0.000448 ***
## treatmentI -2129.4 848.3 -2.510 0.012589 *
## block2 -2978.4 402.9 -7.392 1.45e-12 ***
## block3 2045.9 401.0 5.102 5.98e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2864 on 300 degrees of freedom
## Multiple R-squared: 0.3076, Adjusted R-squared: 0.2915
## F-statistic: 19.04 on 7 and 300 DF, p-value: < 2.2e-16
#Plots
plot(fit.lm)
interaction.plot(data$treatment, data$block, data$price)
Using the sample size, N=33, I used the ndiamond. The 2-way ANOVA test was then performed. Since the certification was blocked, I ignored the F statistics.
#ANOVA
anova = aov(rdiamond$price ~ rdiamond$color+rdiamond$certification)
summary(anova)
## Df Sum Sq Mean Sq F value Pr(>F)
## rdiamond$color 5 106615419 21323084 2.866 0.0352 *
## rdiamond$certification 2 52942085 26471043 3.558 0.0437 *
## Residuals 25 185998031 7439921
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The null hypothesis was that there is no difference between the color of the diamond in regards to the price of the diamonds. The results from the certification would be ignored because it was the blocking variable. From the ANOVA results, the F-statistic for the treatment (color) was 2.866 and the p-value was 0.0352. The alpha value was set to 0.05. The p-value was much less than the alpha value and the F value was not close to 0.
Therefore, since the p-value < alpha, we reject the null hypothesis, and the result was statistically significant. The certification effect was not the research question, so it was neglected.
Null Hypothesis Statistical Testing (NHST) was explored to practice the skills and apply them to the experiment.
Multiple models are used to minimize the importance of the need to reject the null hypothesis. It specifies how well the data fits the set of models. The data are not to be assumed that the they would be derived directly from the null hypothesis. In this case where the outcome was significant, I want to find a family of model that shows the opposite outcome, so insignificant. Since I had significance, I want to find insignificance. The purpose of multiple models is to show that the results are truly significant or insignificant (depending on your results) by showing a model or family of models for which I would not get the same significant results.
From the original model, I found that the results were significant.
I first looked at the interactions.
#Check Interaction
mm1 = aov(rdiamond$price ~ rdiamond$color*rdiamond$certification)
summary(mm1)
## Df Sum Sq Mean Sq F value Pr(>F)
## rdiamond$color 5 106615419 21323084 2.590 0.0580
## rdiamond$certification 2 52942085 26471043 3.215 0.0615
## rdiamond$color:rdiamond$certification 5 21341985 4268397 0.518 0.7593
## Residuals 20 164656046 8232802
##
## rdiamond$color .
## rdiamond$certification .
## rdiamond$color:rdiamond$certification
## Residuals
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The interactions showed some significance, but the interaction was fairly insignificant. This strengthens the assumption that there was no interaction between the blocking and response variable.
Then, I looked at the case, where I transformed the outcome of the Y by increasing the power to 2.
rdiamond$price = (rdiamond$price)^2
mm2= aov(rdiamond$price ~ rdiamond$color+rdiamond$certification)
summary(mm2)
## Df Sum Sq Mean Sq F value Pr(>F)
## rdiamond$color 5 1.219e+16 2.438e+15 2.117 0.0967 .
## rdiamond$certification 2 4.161e+15 2.081e+15 1.807 0.1850
## Residuals 25 2.879e+16 1.152e+15
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Here, I noticed that the results were insignificant.
I tried more cases. For this, I multiplied the coefficient 4.
rdiamond$price = (rdiamond$price)*4
mm3 = aov(rdiamond$price ~ rdiamond$color+rdiamond$certification)
summary(mm3)
## Df Sum Sq Mean Sq F value Pr(>F)
## rdiamond$color 5 1.950e+17 3.900e+16 2.117 0.0967 .
## rdiamond$certification 2 6.658e+16 3.329e+16 1.807 0.1850
## Residuals 25 4.607e+17 1.843e+16
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The results were insignificant.
I also looked into log.
N = 33
mdiamond = diamond[sample(nrow(diamond),N),]
mdiamond$price = log(mdiamond$price)
mm4 = aov(mdiamond$price ~ mdiamond$color+mdiamond$certification)
summary(mm4)
## Df Sum Sq Mean Sq F value Pr(>F)
## mdiamond$color 4 1.359 0.3398 0.740 0.5735
## mdiamond$certification 2 3.714 1.8572 4.043 0.0296 *
## Residuals 26 11.944 0.4594
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Interestingly, the results were insignificant, while the certification showed significance.
Next, I looked into squaring the certification.
mdiamond$certification2 = as.integer(mdiamond$certification)^2
mm5 = aov(mdiamond$price ~ mdiamond$color+mdiamond$certification2)
summary(mm5)
## Df Sum Sq Mean Sq F value Pr(>F)
## mdiamond$color 4 1.359 0.3398 0.586 0.675
## mdiamond$certification2 1 0.003 0.0030 0.005 0.943
## Residuals 27 15.655 0.5798
This made the results insignificant as well.
Finally, I considered other fixed effect models, where I added a new factor, clarity.
test = read.csv("nmodel.csv", header = TRUE, sep=",")
N = 33
nm = test[sample(nrow(test),N),]
mm6 = aov(test$price ~ test$clarity+test$certification)
summary(mm6)
## Df Sum Sq Mean Sq F value Pr(>F)
## test$clarity 4 2.997e+08 74917212 8.826 9.46e-07 ***
## test$certification 2 7.009e+08 350460667 41.290 < 2e-16 ***
## Residuals 301 2.555e+09 8487831
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
This time, the results were significant. The clarity seemed to have affected the price of diamond a lot. From an intuitive point of view, this makes sense.
Therefore, with multiple models I was able to show different family of models that predicted no difference, positive difference, and negative difference. In other words, I was able to show that other models can make the results insignificant, contrast to my significance.
The second NHST I considered was confidence intervals. Confidence intervals are a medium or tool to assess the probability that a parameter lies within a certain range of values. It is used to appraise results found. To interpret confidence intervals, I would use to see where the values fall under this particular range of values. It estimates the degree to which the values is most likely going to be the true value. I prefer confidence intervals over effect size because it can replace significance testing.
First, I used confint to see the confidence interval.
confint(anova)
## 2.5 % 97.5 %
## (Intercept) -4315.6425 6919.6425
## rdiamond$colorE 750.8011 12825.7181
## rdiamond$colorF -2575.6021 9585.2104
## rdiamond$colorG -2337.4214 10092.1483
## rdiamond$colorH -1273.2912 10817.0493
## rdiamond$colorI -1135.7277 13079.3409
## rdiamond$certificationHRD -2768.7008 2674.0127
## rdiamond$certificationIGI -6608.6126 -798.3339
Then, I used Tukey’s Method. Tukey’s Method creates confidence intervals for paired difference among factor levels and helps control the error rate, which I specified as 0.05.
tukey = TukeyHSD(anova)
plot(tukey)
TukeyHSD(anova)
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = rdiamond$price ~ rdiamond$color + rdiamond$certification)
##
## $`rdiamond$color`
## diff lwr upr p adj
## E-D 6252.4286 -2733.905 15238.7617 0.2981618
## F-D 1853.5556 -7007.088 10714.1995 0.9862680
## G-D 3127.2000 -6081.051 12335.4513 0.8973377
## H-D 4754.1250 -4161.726 13669.9760 0.5793071
## I-D 2268.3333 -7438.016 11974.6825 0.9776018
## F-E -4398.8730 -8635.071 -162.6749 0.0385353
## G-E -3125.2286 -8047.246 1796.7888 0.3935379
## H-E -1498.3036 -5848.793 2852.1863 0.8918532
## I-E -3984.0952 -9784.748 1816.5579 0.3111946
## G-F 1273.6444 -3414.968 5962.2565 0.9575113
## H-F 2900.5694 -1183.986 6985.1245 0.2779638
## I-F 414.7778 -5189.186 6018.7411 0.9999037
## H-G 1626.9250 -3165.201 6419.0509 0.8974604
## I-G -858.8667 -6997.701 5279.9676 0.9978711
## I-H -2485.7917 -8176.643 3205.0600 0.7571489
##
## $`rdiamond$certification`
## diff lwr upr p adj
## HRD-GIA -194.6715 -3397.414 3008.0710 0.987449
## IGI-GIA -2237.9506 -5011.607 535.7057 0.130649
## IGI-HRD -2043.2791 -5624.054 1537.4958 0.345518
The confidence interval for the certification contained zero. A zero in a confidence interval range means that there is no difference. In other words, the confidence interval that contained zero means that there is no statistical significance. For my case, there were inconsistencies and consistencies depending on the family model I looked with my ANOVA tests because certification was not significant in some.
I estimated the estimation of the parameters for the N=33.
fit.lm2 = lm(rdiamond$price ~ rdiamond$color+rdiamond$certification)
summary(fit.lm2)
##
## Call:
## lm(formula = rdiamond$price ~ rdiamond$color + rdiamond$certification)
##
## Residuals:
## Min 1Q Median 3Q Max
## -233227043 -87657323 -5532014 65925768 237594233
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6780816 135746198 0.050 0.9606
## rdiamond$colorE 275474231 145890743 1.888 0.0707 .
## rdiamond$colorF 116416918 146928544 0.792 0.4356
## rdiamond$colorG 135329147 150175704 0.901 0.3761
## rdiamond$colorH 174864663 146077092 1.197 0.2425
## rdiamond$colorI 219826615 171748338 1.280 0.2123
## rdiamond$certificationHRD -16358766 65759585 -0.249 0.8056
## rdiamond$certificationIGI -133199705 70200556 -1.897 0.0694 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 135700000 on 25 degrees of freedom
## Multiple R-squared: 0.3622, Adjusted R-squared: 0.1836
## F-statistic: 2.028 on 7 and 25 DF, p-value: 0.0913
Note: [Some outputs were deleted.]
coef(anova)
## (Intercept) rdiamond$colorE
## 1302.00000 6788.25961
## rdiamond$colorF rdiamond$colorG
## 3504.80412 3877.36346
## rdiamond$colorH rdiamond$colorI
## 4771.87901 5971.80659
## rdiamond$certificationHRD rdiamond$certificationIGI
## -47.34404 -3703.47326
To check if the normality assumption holds, I used the qqnorm and qqlines.
#Check for Normality
qqnorm(residuals(fit.lm2))
qqline(residuals(fit.lm2))
#Residual Plot
plot(fitted(anova),residuals(anova))
Based on the qqnorm and qqline, the qqnorm line looks fairly linear and closely fits the qqline drawn. Also, the residual plot showed an evenly distributed residual as a function of fitted values. Also, there was a good scatter and not close to 0. The pattern seemed random and evenly distributed, two important conditions that satisfy the normality check. Therefore, the normality assumption on the IV was satisfied because the residuals were evenly distributed and the qqnorm line is linear enough. The normality check passed. There is no need to do transformation because the normality assumption was passed.
Summary: Overall, the experiment finding was an interesting one. Based on the p-values, we learned that the color of the diamond does affect the price. The study focused on a blocked design of certification. Therefore, we rejected the null hypothesis. There was a difference between the color of the diamond in regards to the price of the diamonds. However, by exploring different family of models through multiple models, we were able to see changes in significance. In general, the color of the diamond seemed to have an effect on the price of the diamond.
In terms of the model, I would say this was a mixed effects model because there was randomization from the data collection and the variables were selected. However, some might argue it was a fixed effects model because the data might not have represented the general population.
For the findings, it was interesting to explore the multiple models.
The analysis made sense and the block design was well executed.
http://www.stat.purdue.edu/~xuanyaoh/stat350/xyMar26Lec23.pdf
http://people.stat.sfu.ca/~thompson/stat403-650/randomized_block.html
http://www.r-tutor.com/elementary-statistics/analysis-variance/randomized-block-design
https://vincentarelbundock.github.io/Rdatasets/doc/Ecdat/Diamond.html
https://statistics.laerd.com/spss-tutorials/multiple-regression-using-spss-statistics.php
#Benjamin Byeon
#10/30/16
#Section 1
#Load the data
diamond = read.csv("diamonds.csv", header = TRUE, sep=",")
diamond
#System under test
head(diamond)
tail(diamond)
names(diamond)
#Display the structure
str(diamond)
#Check the levels
levels(diamond$price)
levels(diamond$color)
levels(diamond$certification)
#Randomize the data
index = sample(1:nrow(diamond), 308, replace=FALSE)
rdata = diamond[index,]
rdata
#Display the descriptive statistics
summary(diamond)
#Histogram
hist(diamond$price, main="Histogram of Diamond Prices")
#Plot for certification
plot(diamond$price, pch=as.character(diamond$certification))
#Boxplots
boxplot(diamond$price~diamond$color, main = "Price vs Color", xlab = "Price in Singapore $",ylab="Color")
boxplot(diamond$price~diamond$certification, main = "Price vs Certification", xlab = "Price in Singapore $",ylab="Certification")
#Calculating the main effect to get the effect size (rather confirm with G*Power)
#Take the difference between the means of the non-blocking IV (color)
m1 <- mean(subset(diamond$price, diamond$color=="D"))
m2 <- mean(subset(diamond$price, diamond$color=="E"))
m3 <- mean(subset(diamond$price, diamond$color=="F"))
m4 <- mean(subset(diamond$price, diamond$color=="G"))
m5 <- mean(subset(diamond$price, diamond$color=="H"))
m6 <- mean(subset(diamond$price, diamond$color=="I"))
me <- m6 - m5 - m4 - m3 - m2 - m1
#Print the result
me
#Effect Size
m1 <- mean(diamond$color=="D")
m2 <- mean(diamond$color=="E")
m3 <- mean(diamond$color=="F")
m4 <- mean(diamond$color=="G")
m5 <- mean(diamond$color=="H")
m6 <- mean(diamond$color=="I")
diff_mean = m6 - m5 - m4 - m3 -m2 -m1
sd1 <- sd(diamond$color=="D")
sd2 <- sd(diamond$color=="E")
sd3 <- sd(diamond$color=="F")
sd4 <- sd(diamond$color=="G")
sd5 <- sd(diamond$color=="H")
sd6 <- sd(diamond$color=="I")
sd = (sd1+sd2+sd3+sd4+sd5+sd6)/2
result = abs(diff_mean/sd)
result
#Organize the data to treatments and block
data = read.csv("block.csv", header = TRUE, sep=",")
data
#Check the structure of the organized data
str(data)
data$block = factor(data$block)
data$block
#Randomized Complete Block Design with ANOVA Analysis
fit.lm = lm(price ~ treatment+block, data=data)
anova(fit.lm)
#Plots
plot(fit.lm)
interaction.plot(data$treatment, data$block, data$price)
#Set the right sample size
N = 33
set.seed(8)
#rdiamond = new diamond data
rdiamond = diamond[sample(nrow(diamond),N),]
#ANOVA
anova = aov(rdiamond$price ~ rdiamond$color+rdiamond$certification)
summary(anova)
#Multiple models
#Check Interaction
mm1 = aov(rdiamond$price ~ rdiamond$color*rdiamond$certification)
summary(mm1)
#Insignificant - original was significant
rdiamond$price = (rdiamond$price)^2
mm2= aov(rdiamond$price ~ rdiamond$color+rdiamond$certification)
summary(mm2)
#Insignificant
rdiamond$price = (rdiamond$price)*4
mm3 = aov(rdiamond$price ~ rdiamond$color+rdiamond$certification)
summary(mm3)
#Insignificant
N = 33
mdiamond = diamond[sample(nrow(diamond),N),]
mdiamond$price = log(mdiamond$price)
mm4 = aov(mdiamond$price ~ mdiamond$color+mdiamond$certification)
summary(mm4)
#Insignificant
mdiamond$certification2 = as.integer(mdiamond$certification)^2
mm5 = aov(mdiamond$price ~ mdiamond$color+mdiamond$certification2)
summary(mm5)
#Other fixed effect model - Significant
test = read.csv("nmodel.csv", header = TRUE, sep=",")
N = 33
nm = test[sample(nrow(test),N),]
mm6 = aov(test$price ~ test$clarity+test$certification)
summary(mm6)
#Confidence Interval
confint(anova1)
tukey = TukeyHSD(anova1)
plot(tukey)
TukeyHSD(anova)
#Estimation
fit.lm2 = lm(rdiamond$price ~ rdiamond$color+rdiamond$certification)
summary(fit.lm2)
coef(anova)
#Check for Normality
qqnorm(residuals(fit.lm2))
qqline(residuals(fit.lm2))
#Residual Plot
plot(fitted(anova),residuals(anova))