DOE Project2

Benjamin Byeon

Rensselaer Polytechnic Institute - Troy, NY

V1.10.30.16

1. Setting

From the Ecdat Package, I selected a dataset that contained N=308 observations. The data looks into different factors that affect the pricing of the diamond carats. However, the study selected a pair of categorical independent variables and a continuous variable dependent variable. Also, the data was organized so that one independent variable was blocked. Further analysis were made regarding the study.

The data was copied and pasted into Excel, and then saved as a .csv. Shown below is the loaded data into R.

#Load the data
diamond = read.csv("diamonds.csv", header = TRUE, sep=",")
diamond
##     price color certification
## 1    1302     D           GIA
## 2    1510     E           GIA
## 3    1510     G           GIA
## 4    1260     G           GIA
## 5    1641     D           GIA
## 6    1555     E           GIA
## 7    1427     F           GIA
## 8    1427     G           GIA
## 9    1126     H           GIA
## 10   1126     I           GIA
## 11   1468     F           GIA
## 12   1202     G           GIA
## 13   1327     E           GIA
## 14   1098     I           GIA
## 15   1693     E           GIA
## 16   1551     F           GIA
## 17   1410     G           GIA
## 18   1269     G           GIA
## 19   1316     H           GIA
## 20   1222     H           GIA
## 21   1738     E           GIA
## 22   1593     F           GIA
## 23   1447     G           GIA
## 24   1255     H           GIA
## 25   1635     F           GIA
## 26   1485     H           GIA
## 27   1420     F           GIA
## 28   1420     H           GIA
## 29   1911     F           GIA
## 30   1525     H           GIA
## 31   1956     F           GIA
## 32   1747     H           GIA
## 33   1572     I           GIA
## 34   2942     E           GIA
## 35   2532     G           GIA
## 36   3501     E           GIA
## 37   3501     E           GIA
## 38   3501     F           GIA
## 39   3293     F           GIA
## 40   3016     G           GIA
## 41   3567     F           GIA
## 42   3205     G           GIA
## 43   3490     D           GIA
## 44   3635     E           GIA
## 45   3635     F           GIA
## 46   3418     F           GIA
## 47   3921     D           GIA
## 48   3701     F           GIA
## 49   3480     F           GIA
## 50   3407     G           GIA
## 51   3767     E           GIA
## 52   4066     F           GIA
## 53   4138     E           GIA
## 54   3605     F           GIA
## 55   3529     G           GIA
## 56   3667     F           GIA
## 57   2892     I           GIA
## 58   3651     G           GIA
## 59   3773     G           GIA
## 60   4291     F           GIA
## 61   5845     E           GIA
## 62   4401     G           GIA
## 63   4759     G           GIA
## 64   4300     H           GIA
## 65   5510     F           GIA
## 66   5122     G           GIA
## 67   5122     H           GIA
## 68   3861     I           GIA
## 69   5881     F           GIA
## 70   5586     F           GIA
## 71   5193     F           GIA
## 72   5193     H           GIA
## 73   5263     F           GIA
## 74   5441     I           GIA
## 75   4948     I           GIA
## 76   5705     H           GIA
## 77   6805     F           GIA
## 78   6882     H           GIA
## 79   6709     H           GIA
## 80   6682     I           GIA
## 81   3501     E           GIA
## 82   3432     G           GIA
## 83   3851     F           GIA
## 84   3605     H           GIA
## 85   3900     E           GIA
## 86   3415     H           GIA
## 87   4291     H           GIA
## 88   6512     E           GIA
## 89   5800     E           GIA
## 90   6285     F           GIA
## 91   5122     F           GIA
## 92   5122     F           GIA
## 93   5122     G           GIA
## 94   5122     H           GIA
## 95   6372     D           GIA
## 96   5881     E           GIA
## 97   5193     H           GIA
## 98   5961     E           GIA
## 99   5662     H           GIA
## 100  5738     E           GIA
## 101  5030     H           GIA
## 102  5030     H           GIA
## 103  4727     I           GIA
## 104  4221     I           GIA
## 105  5815     G           GIA
## 106  4585     H           GIA
## 107  7368     D           GIA
## 108  4667     I           GIA
## 109  4355     I           GIA
## 110  9885     D           GIA
## 111  6919     F           GIA
## 112  5386     H           GIA
## 113  4832     I           GIA
## 114  7156     E           GIA
## 115  7680     F           GIA
## 116 15582     D           GIA
## 117 11419     D           GIA
## 118 10588     E           GIA
## 119  9757     E           GIA
## 120 13913     F           GIA
## 121 10588     F           GIA
## 122 10713     F           GIA
## 123  9480     F           GIA
## 124  9896     G           GIA
## 125  9619     G           GIA
## 126  9169     G           GIA
## 127  9203     G           GIA
## 128  8788     H           GIA
## 129  8095     I           GIA
## 130  7818     I           GIA
## 131 16008     D           GIA
## 132 10692     E           GIA
## 133  9853     E           GIA
## 134 10272     F           GIA
## 135  9573     F           GIA
## 136  9153     H           GIA
## 137  8873     H           GIA
## 138  8873     I           GIA
## 139  8455     I           GIA
## 140  7895     I           GIA
## 141 10372     F           GIA
## 142  9666     F           GIA
## 143 10090     G           GIA
## 144 10900     E           GIA
## 145 10571     F           GIA
## 146  9563     I           GIA
## 147  8781     I           GIA
## 148  9743     G           GIA
## 149  9302     H           GIA
## 150  8945     I           GIA
## 151  9646     H           GIA
## 152   823     F           IGI
## 153   765     F           IGI
## 154   803     G           IGI
## 155   803     G           IGI
## 156   705     G           IGI
## 157   725     H           IGI
## 158   967     D           IGI
## 159  1050     E           IGI
## 160   967     F           IGI
## 161   863     F           IGI
## 162   800     F           IGI
## 163   842     G           IGI
## 164   800     G           IGI
## 165   758     H           IGI
## 166   880     D           IGI
## 167   880     G           IGI
## 168   705     G           IGI
## 169   638     G           IGI
## 170   919     D           IGI
## 171  1149     E           IGI
## 172  1057     F           IGI
## 173   919     G           IGI
## 174  1198     E           IGI
## 175  1248     E           IGI
## 176  1147     F           IGI
## 177   995     G           IGI
## 178  1108     H           IGI
## 179  1485     F           IGI
## 180  1283     G           IGI
## 181  1149     H           IGI
## 182  1082     I           IGI
## 183  1539     F           IGI
## 184  1365     F           IGI
## 185  1260     F           IGI
## 186  1121     I           IGI
## 187  1595     F           IGI
## 188  1233     H           IGI
## 189  1199     I           IGI
## 190  1471     G           IGI
## 191  1238     I           IGI
## 192  1580     E           IGI
## 193  1459     F           IGI
## 194  1459     G           IGI
## 195  1218     H           IGI
## 196  1299     I           IGI
## 197  1628     E           IGI
## 198  1628     F           IGI
## 199  1337     I           IGI
## 200  1462     H           IGI
## 201  1503     H           IGI
## 202  1773     F           IGI
## 203  1636     F           IGI
## 204  1821     F           IGI
## 205  1540     G           IGI
## 206  2276     G           IGI
## 207  1616     I           IGI
## 208  1506     I           IGI
## 209  2651     F           IGI
## 210  2383     F           IGI
## 211  3652     G           IGI
## 212  3722     E           IGI
## 213  3722     F           IGI
## 214  3095     I           IGI
## 215  3706     F           IGI
## 216  4070     E           IGI
## 217  3470     G           IGI
## 218  4831     E           IGI
## 219  4209     F           IGI
## 220  3821     G           IGI
## 221  5607     G           IGI
## 222  5326     G           IGI
## 223  6160     D           IGI
## 224  6095     F           IGI
## 225  5937     G           IGI
## 226  9342     H           IGI
## 227  9713     G           IGI
## 228  8873     H           IGI
## 229  8175     I           IGI
## 230  3778     F           HRD
## 231  3432     G           HRD
## 232  3851     F           HRD
## 233  3346     E           HRD
## 234  3130     H           HRD
## 235  3995     F           HRD
## 236  3701     F           HRD
## 237  3529     G           HRD
## 238  3667     F           HRD
## 239  3202     F           HRD
## 240  3256     F           HRD
## 241  3415     H           HRD
## 242  3792     H           HRD
## 243  3925     G           HRD
## 244  3421     G           HRD
## 245  3925     H           HRD
## 246  3616     H           HRD
## 247  3615     I           HRD
## 248  3785     H           HRD
## 249  3643     I           HRD
## 250  4300     H           HRD
## 251  6867     E           HRD
## 252  6285     E           HRD
## 253  5800     G           HRD
## 254  5510     G           HRD
## 255  4346     H           HRD
## 256  6372     G           HRD
## 257  5193     H           HRD
## 258  5662     H           HRD
## 259  5333     F           HRD
## 260  6041     G           HRD
## 261  5815     H           HRD
## 262  8611     F           HRD
## 263  6905     F           HRD
## 264  6905     G           HRD
## 265  6416     H           HRD
## 266  6051     H           HRD
## 267  8715     E           HRD
## 268  6988     E           HRD
## 269  6988     F           HRD
## 270  6495     G           HRD
## 271  7358     H           HRD
## 272  6572     F           HRD
## 273  7072     G           HRD
## 274  8359     F           HRD
## 275  6805     F           HRD
## 276  7711     G           HRD
## 277  5835     H           HRD
## 278 13775     D           HRD
## 279 14051     E           HRD
## 280 11419     E           HRD
## 281 10588     E           HRD
## 282 11696     F           HRD
## 283 10588     F           HRD
## 284 10450     G           HRD
## 285  9896     G           HRD
## 286  9203     G           HRD
## 287  9480     H           HRD
## 288  9065     H           HRD
## 289  8788     H           HRD
## 290  8788     I           HRD
## 291  8372     I           HRD
## 292  8095     I           HRD
## 293  7818     I           HRD
## 294 13909     D           HRD
## 295 11531     E           HRD
## 296 10692     E           HRD
## 297 11811     F           HRD
## 298 10272     F           HRD
## 299  9993     G           HRD
## 300  9293     G           HRD
## 301  9433     H           HRD
## 302  9153     H           HRD
## 303  8873     I           HRD
## 304  8175     I           HRD
## 305 10796     F           HRD
## 306  9890     H           HRD
## 307  8959     H           HRD
## 308  9107     I           HRD

System under test

The dataset helps determine the effects of color and certification to the pricing of the diamond. Initially, the dataset consisted of many factors. However, I chose two independent variables that were of interest based purely on my preference.

I selected the color and certification to examine their effects. In addition, I chose which indepdendent variable to block. I blocked the variable that seemed to have less of an effect on price. Therefore, I decided to block certification because I thought certification had to do more with the institution certifying the diamond and less with the value of the diamond itself, like the color of the diamon. I was interested in studying a Randomized Complete Block Design.

Below are the definitions of the variables:

price: the price of the diamond in Singapore $

color: the color of the diamond with a factor of levels (D,E,F,G,H,I)

certification: the certification body with a factor of levels (GIA,IGI,HRD)

#Display the descriptive statistics
summary(diamond)
##      price       color  certification
##  Min.   :  638   D:16   GIA:151      
##  1st Qu.: 1625   E:44   HRD: 79      
##  Median : 4215   F:82   IGI: 78      
##  Mean   : 5019   G:65                
##  3rd Qu.: 7446   H:61                
##  Max.   :16008   I:40
#System under test
head(diamond)
##   price color certification
## 1  1302     D           GIA
## 2  1510     E           GIA
## 3  1510     G           GIA
## 4  1260     G           GIA
## 5  1641     D           GIA
## 6  1555     E           GIA
names(diamond)
## [1] "price"         "color"         "certification"

Factors and Levels

The data contains 3 factors and several levels. The as.factors was not necessary because they were categorical.

#Display the structure
str(diamond)
## 'data.frame':    308 obs. of  3 variables:
##  $ price        : int  1302 1510 1510 1260 1641 1555 1427 1427 1126 1126 ...
##  $ color        : Factor w/ 6 levels "D","E","F","G",..: 1 2 4 4 1 2 3 4 5 6 ...
##  $ certification: Factor w/ 3 levels "GIA","HRD","IGI": 1 1 1 1 1 1 1 1 1 1 ...
#Check the levels
levels(diamond$price)
## NULL
levels(diamond$color)
## [1] "D" "E" "F" "G" "H" "I"
levels(diamond$certification)
## [1] "GIA" "HRD" "IGI"

To summarize, the levels are:

color: D,E,F,G,H,I

certification: GIA,IGI,HRD

Continuous Variables (if any)

The data had one continuous variable that was also the dependent variable. It was the price.

Response Variable(s)

The main objective of the experiment was to determine the effects of pricing on diamond. Therefore, the price variable also represents the response variable as well.

The Data: How is it organized and what does it look like?

The data consists of N=308 obversations with 3 variables. The data was collected in the year 2000 and the country the data came from was Singapore.

Here are the organizations and views of the data:

summary(diamond)
##      price       color  certification
##  Min.   :  638   D:16   GIA:151      
##  1st Qu.: 1625   E:44   HRD: 79      
##  Median : 4215   F:82   IGI: 78      
##  Mean   : 5019   G:65                
##  3rd Qu.: 7446   H:61                
##  Max.   :16008   I:40
head(diamond)
##   price color certification
## 1  1302     D           GIA
## 2  1510     E           GIA
## 3  1510     G           GIA
## 4  1260     G           GIA
## 5  1641     D           GIA
## 6  1555     E           GIA
tail(diamond)
##     price color certification
## 303  8873     I           HRD
## 304  8175     I           HRD
## 305 10796     F           HRD
## 306  9890     H           HRD
## 307  8959     H           HRD
## 308  9107     I           HRD

Next, let’s look at the block design.

2. (Experimental) Design

How will the experiment be organized and conducted to test the hypothesis?

To consider for blocking, I needed to organize the data so that I could block on one independent variable and test a hypothesis on the second independent variable. Therefore, I chose certification as the variable to block. The color was the treatment. Take a closer look at the re-organized dataset:

#Organize the data to treatments and block
data = read.csv("block.csv", header = TRUE, sep=",")
data
##     price treatment block
## 1    1302         D     1
## 2    1510         E     1
## 3    1510         G     1
## 4    1260         G     1
## 5    1641         D     1
## 6    1555         E     1
## 7    1427         F     1
## 8    1427         G     1
## 9    1126         H     1
## 10   1126         I     1
## 11   1468         F     1
## 12   1202         G     1
## 13   1327         E     1
## 14   1098         I     1
## 15   1693         E     1
## 16   1551         F     1
## 17   1410         G     1
## 18   1269         G     1
## 19   1316         H     1
## 20   1222         H     1
## 21   1738         E     1
## 22   1593         F     1
## 23   1447         G     1
## 24   1255         H     1
## 25   1635         F     1
## 26   1485         H     1
## 27   1420         F     1
## 28   1420         H     1
## 29   1911         F     1
## 30   1525         H     1
## 31   1956         F     1
## 32   1747         H     1
## 33   1572         I     1
## 34   2942         E     1
## 35   2532         G     1
## 36   3501         E     1
## 37   3501         E     1
## 38   3501         F     1
## 39   3293         F     1
## 40   3016         G     1
## 41   3567         F     1
## 42   3205         G     1
## 43   3490         D     1
## 44   3635         E     1
## 45   3635         F     1
## 46   3418         F     1
## 47   3921         D     1
## 48   3701         F     1
## 49   3480         F     1
## 50   3407         G     1
## 51   3767         E     1
## 52   4066         F     1
## 53   4138         E     1
## 54   3605         F     1
## 55   3529         G     1
## 56   3667         F     1
## 57   2892         I     1
## 58   3651         G     1
## 59   3773         G     1
## 60   4291         F     1
## 61   5845         E     1
## 62   4401         G     1
## 63   4759         G     1
## 64   4300         H     1
## 65   5510         F     1
## 66   5122         G     1
## 67   5122         H     1
## 68   3861         I     1
## 69   5881         F     1
## 70   5586         F     1
## 71   5193         F     1
## 72   5193         H     1
## 73   5263         F     1
## 74   5441         I     1
## 75   4948         I     1
## 76   5705         H     1
## 77   6805         F     1
## 78   6882         H     1
## 79   6709         H     1
## 80   6682         I     1
## 81   3501         E     1
## 82   3432         G     1
## 83   3851         F     1
## 84   3605         H     1
## 85   3900         E     1
## 86   3415         H     1
## 87   4291         H     1
## 88   6512         E     1
## 89   5800         E     1
## 90   6285         F     1
## 91   5122         F     1
## 92   5122         F     1
## 93   5122         G     1
## 94   5122         H     1
## 95   6372         D     1
## 96   5881         E     1
## 97   5193         H     1
## 98   5961         E     1
## 99   5662         H     1
## 100  5738         E     1
## 101  5030         H     1
## 102  5030         H     1
## 103  4727         I     1
## 104  4221         I     1
## 105  5815         G     1
## 106  4585         H     1
## 107  7368         D     1
## 108  4667         I     1
## 109  4355         I     1
## 110  9885         D     1
## 111  6919         F     1
## 112  5386         H     1
## 113  4832         I     1
## 114  7156         E     1
## 115  7680         F     1
## 116 15582         D     1
## 117 11419         D     1
## 118 10588         E     1
## 119  9757         E     1
## 120 13913         F     1
## 121 10588         F     1
## 122 10713         F     1
## 123  9480         F     1
## 124  9896         G     1
## 125  9619         G     1
## 126  9169         G     1
## 127  9203         G     1
## 128  8788         H     1
## 129  8095         I     1
## 130  7818         I     1
## 131 16008         D     1
## 132 10692         E     1
## 133  9853         E     1
## 134 10272         F     1
## 135  9573         F     1
## 136  9153         H     1
## 137  8873         H     1
## 138  8873         I     1
## 139  8455         I     1
## 140  7895         I     1
## 141 10372         F     1
## 142  9666         F     1
## 143 10090         G     1
## 144 10900         E     1
## 145 10571         F     1
## 146  9563         I     1
## 147  8781         I     1
## 148  9743         G     1
## 149  9302         H     1
## 150  8945         I     1
## 151  9646         H     1
## 152   823         F     2
## 153   765         F     2
## 154   803         G     2
## 155   803         G     2
## 156   705         G     2
## 157   725         H     2
## 158   967         D     2
## 159  1050         E     2
## 160   967         F     2
## 161   863         F     2
## 162   800         F     2
## 163   842         G     2
## 164   800         G     2
## 165   758         H     2
## 166   880         D     2
## 167   880         G     2
## 168   705         G     2
## 169   638         G     2
## 170   919         D     2
## 171  1149         E     2
## 172  1057         F     2
## 173   919         G     2
## 174  1198         E     2
## 175  1248         E     2
## 176  1147         F     2
## 177   995         G     2
## 178  1108         H     2
## 179  1485         F     2
## 180  1283         G     2
## 181  1149         H     2
## 182  1082         I     2
## 183  1539         F     2
## 184  1365         F     2
## 185  1260         F     2
## 186  1121         I     2
## 187  1595         F     2
## 188  1233         H     2
## 189  1199         I     2
## 190  1471         G     2
## 191  1238         I     2
## 192  1580         E     2
## 193  1459         F     2
## 194  1459         G     2
## 195  1218         H     2
## 196  1299         I     2
## 197  1628         E     2
## 198  1628         F     2
## 199  1337         I     2
## 200  1462         H     2
## 201  1503         H     2
## 202  1773         F     2
## 203  1636         F     2
## 204  1821         F     2
## 205  1540         G     2
## 206  2276         G     2
## 207  1616         I     2
## 208  1506         I     2
## 209  2651         F     2
## 210  2383         F     2
## 211  3652         G     2
## 212  3722         E     2
## 213  3722         F     2
## 214  3095         I     2
## 215  3706         F     2
## 216  4070         E     2
## 217  3470         G     2
## 218  4831         E     2
## 219  4209         F     2
## 220  3821         G     2
## 221  5607         G     2
## 222  5326         G     2
## 223  6160         D     2
## 224  6095         F     2
## 225  5937         G     2
## 226  9342         H     2
## 227  9713         G     2
## 228  8873         H     2
## 229  8175         I     2
## 230  3778         F     3
## 231  3432         G     3
## 232  3851         F     3
## 233  3346         E     3
## 234  3130         H     3
## 235  3995         F     3
## 236  3701         F     3
## 237  3529         G     3
## 238  3667         F     3
## 239  3202         F     3
## 240  3256         F     3
## 241  3415         H     3
## 242  3792         H     3
## 243  3925         G     3
## 244  3421         G     3
## 245  3925         H     3
## 246  3616         H     3
## 247  3615         I     3
## 248  3785         H     3
## 249  3643         I     3
## 250  4300         H     3
## 251  6867         E     3
## 252  6285         E     3
## 253  5800         G     3
## 254  5510         G     3
## 255  4346         H     3
## 256  6372         G     3
## 257  5193         H     3
## 258  5662         H     3
## 259  5333         F     3
## 260  6041         G     3
## 261  5815         H     3
## 262  8611         F     3
## 263  6905         F     3
## 264  6905         G     3
## 265  6416         H     3
## 266  6051         H     3
## 267  8715         E     3
## 268  6988         E     3
## 269  6988         F     3
## 270  6495         G     3
## 271  7358         H     3
## 272  6572         F     3
## 273  7072         G     3
## 274  8359         F     3
## 275  6805         F     3
## 276  7711         G     3
## 277  5835         H     3
## 278 13775         D     3
## 279 14051         E     3
## 280 11419         E     3
## 281 10588         E     3
## 282 11696         F     3
## 283 10588         F     3
## 284 10450         G     3
## 285  9896         G     3
## 286  9203         G     3
## 287  9480         H     3
## 288  9065         H     3
## 289  8788         H     3
## 290  8788         I     3
## 291  8372         I     3
## 292  8095         I     3
## 293  7818         I     3
## 294 13909         D     3
## 295 11531         E     3
## 296 10692         E     3
## 297 11811         F     3
## 298 10272         F     3
## 299  9993         G     3
## 300  9293         G     3
## 301  9433         H     3
## 302  9153         H     3
## 303  8873         I     3
## 304  8175         I     3
## 305 10796         F     3
## 306  9890         H     3
## 307  8959         H     3
## 308  9107         I     3

Before explaining more in detail, considering results from G*Power, and extending the Null Hypothesis Significance Test (NHST) to two alternatives, I want to outline some key portions from the Recipes for Design of Experiments.

Organization of the Experiment and Testing the Hypothesis Setup

The original dataset for the diamonds case included several factors and response variables. However, the factors were chosen based primarily on the rules of ANOVA and block design, and personal preference with Best Guess in regards to predicting the effect. I needed to have independent variables (IV) that were categorical and dependent variables (DV) that were continuous.

The block design was placed onto one of the pair of the categorical IVs. Based on Best Guess, I blocked the certification because typically the pricing of a diamond depends more on the weight and color, than certification.

#Check the structure of the organized data
str(data)
## 'data.frame':    308 obs. of  3 variables:
##  $ price    : int  1302 1510 1510 1260 1641 1555 1427 1427 1126 1126 ...
##  $ treatment: Factor w/ 6 levels "D","E","F","G",..: 1 2 4 4 1 2 3 4 5 6 ...
##  $ block    : int  1 1 1 1 1 1 1 1 1 1 ...
data$block = factor(data$block)
data$block
##   [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [36] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [71] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [106] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [141] 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [176] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [211] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [246] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [281] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## Levels: 1 2 3

In this experiment, the null hypothesis states that there is no difference between the color of the diamond (D,E,F,G,H,I) in regards to the price of the diamonds.

On the contrary, the alternative hypothesis states that there is a difference between the color of dimaond and the price of the diamonds.

For the purpose of this analyis, I chose a: Type I Error (α) of 0.05 because it is the most commonly used in practice for significance. Type II error (β) of 0.1 because from the discussions in class, we learned that a power (1-β) of 0.8 or greater is preferred. An increase in power means that there needs to be more sample size as well.

What is the rationale for this design?

The rationale for this design was limited as I adapted a pre-existing experiment design dataset. The original factors and procedures to collect the data were done by the experimentor. Therefore, I had less flexibility and control over some aspects

However, the rationale for the block design was purely a simple case of a blocked design and study the results by performing ANOVA for the particular factor.

Randomize: What is the Randomization Scheme?

First of all, I assume the data collected was random because the diamond stores were chosen randomly and at a random time frame of the year 2000, including day and time.

Also, to ensure further randomization, I randomized the order of the data:

#Randomize the data
index = sample(1:nrow(diamond), 308, replace=FALSE)
rdata = diamond[index,]
rdata
##     price color certification
## 99   5662     H           GIA
## 58   3651     G           GIA
## 130  7818     I           GIA
## 144 10900     E           GIA
## 267  8715     E           HRD
## 66   5122     G           GIA
## 226  9342     H           IGI
## 205  1540     G           IGI
## 92   5122     F           GIA
## 127  9203     G           GIA
## 67   5122     H           GIA
## 26   1485     H           GIA
## 42   3205     G           GIA
## 10   1126     I           GIA
## 153   765     F           IGI
## 253  5800     G           HRD
## 225  5937     G           IGI
## 215  3706     F           IGI
## 186  1121     I           IGI
## 298 10272     F           HRD
## 223  6160     D           IGI
## 162   800     F           IGI
## 133  9853     E           GIA
## 249  3643     I           HRD
## 132 10692     E           GIA
## 126  9169     G           GIA
## 272  6572     F           HRD
## 13   1327     E           GIA
## 236  3701     F           HRD
## 137  8873     H           GIA
## 279 14051     E           HRD
## 154   803     G           IGI
## 281 10588     E           HRD
## 71   5193     F           GIA
## 8    1427     G           GIA
## 45   3635     F           GIA
## 178  1108     H           IGI
## 231  3432     G           HRD
## 103  4727     I           GIA
## 214  3095     I           IGI
## 173   919     G           IGI
## 28   1420     H           GIA
## 84   3605     H           GIA
## 156   705     G           IGI
## 80   6682     I           GIA
## 147  8781     I           GIA
## 114  7156     E           GIA
## 303  8873     I           HRD
## 34   2942     E           GIA
## 185  1260     F           IGI
## 77   6805     F           GIA
## 247  3615     I           HRD
## 234  3130     H           HRD
## 300  9293     G           HRD
## 189  1199     I           IGI
## 227  9713     G           IGI
## 240  3256     F           HRD
## 283 10588     F           HRD
## 169   638     G           IGI
## 48   3701     F           GIA
## 25   1635     F           GIA
## 170   919     D           IGI
## 270  6495     G           HRD
## 22   1593     F           GIA
## 192  1580     E           IGI
## 78   6882     H           GIA
## 199  1337     I           IGI
## 206  2276     G           IGI
## 194  1459     G           IGI
## 79   6709     H           GIA
## 187  1595     F           IGI
## 230  3778     F           HRD
## 97   5193     H           GIA
## 52   4066     F           GIA
## 202  1773     F           IGI
## 255  4346     H           HRD
## 286  9203     G           HRD
## 87   4291     H           GIA
## 32   1747     H           GIA
## 269  6988     F           HRD
## 211  3652     G           IGI
## 198  1628     F           IGI
## 295 11531     E           HRD
## 246  3616     H           HRD
## 258  5662     H           HRD
## 129  8095     I           GIA
## 38   3501     F           GIA
## 95   6372     D           GIA
## 209  2651     F           IGI
## 39   3293     F           GIA
## 72   5193     H           GIA
## 158   967     D           IGI
## 117 11419     D           GIA
## 229  8175     I           IGI
## 274  8359     F           HRD
## 23   1447     G           GIA
## 165   758     H           IGI
## 280 11419     E           HRD
## 36   3501     E           GIA
## 91   5122     F           GIA
## 157   725     H           IGI
## 265  6416     H           HRD
## 37   3501     E           GIA
## 17   1410     G           GIA
## 53   4138     E           GIA
## 31   1956     F           GIA
## 305 10796     F           HRD
## 273  7072     G           HRD
## 308  9107     I           HRD
## 4    1260     G           GIA
## 212  3722     E           IGI
## 122 10713     F           GIA
## 74   5441     I           GIA
## 278 13775     D           HRD
## 69   5881     F           GIA
## 51   3767     E           GIA
## 70   5586     F           GIA
## 35   2532     G           GIA
## 85   3900     E           GIA
## 110  9885     D           GIA
## 148  9743     G           GIA
## 124  9896     G           GIA
## 152   823     F           IGI
## 113  4832     I           GIA
## 174  1198     E           IGI
## 27   1420     F           GIA
## 159  1050     E           IGI
## 140  7895     I           GIA
## 261  5815     H           HRD
## 151  9646     H           GIA
## 177   995     G           IGI
## 88   6512     E           GIA
## 11   1468     F           GIA
## 60   4291     F           GIA
## 90   6285     F           GIA
## 252  6285     E           HRD
## 102  5030     H           GIA
## 96   5881     E           GIA
## 166   880     D           IGI
## 275  6805     F           HRD
## 111  6919     F           GIA
## 210  2383     F           IGI
## 141 10372     F           GIA
## 107  7368     D           GIA
## 150  8945     I           GIA
## 65   5510     F           GIA
## 183  1539     F           IGI
## 241  3415     H           HRD
## 9    1126     H           GIA
## 135  9573     F           GIA
## 61   5845     E           GIA
## 200  1462     H           IGI
## 291  8372     I           HRD
## 93   5122     G           GIA
## 1    1302     D           GIA
## 294 13909     D           HRD
## 213  3722     F           IGI
## 248  3785     H           HRD
## 276  7711     G           HRD
## 271  7358     H           HRD
## 59   3773     G           GIA
## 196  1299     I           IGI
## 266  6051     H           HRD
## 167   880     G           IGI
## 233  3346     E           HRD
## 145 10571     F           GIA
## 55   3529     G           GIA
## 237  3529     G           HRD
## 293  7818     I           HRD
## 146  9563     I           GIA
## 49   3480     F           GIA
## 190  1471     G           IGI
## 219  4209     F           IGI
## 155   803     G           IGI
## 235  3995     F           HRD
## 40   3016     G           GIA
## 197  1628     E           IGI
## 131 16008     D           GIA
## 29   1911     F           GIA
## 260  6041     G           HRD
## 108  4667     I           GIA
## 16   1551     F           GIA
## 175  1248     E           IGI
## 285  9896     G           HRD
## 82   3432     G           GIA
## 217  3470     G           IGI
## 160   967     F           IGI
## 30   1525     H           GIA
## 222  5326     G           IGI
## 243  3925     G           HRD
## 64   4300     H           GIA
## 250  4300     H           HRD
## 263  6905     F           HRD
## 224  6095     F           IGI
## 245  3925     H           HRD
## 306  9890     H           HRD
## 301  9433     H           HRD
## 143 10090     G           GIA
## 100  5738     E           GIA
## 242  3792     H           HRD
## 277  5835     H           HRD
## 7    1427     F           GIA
## 264  6905     G           HRD
## 56   3667     F           GIA
## 244  3421     G           HRD
## 21   1738     E           GIA
## 207  1616     I           IGI
## 239  3202     F           HRD
## 168   705     G           IGI
## 184  1365     F           IGI
## 296 10692     E           HRD
## 191  1238     I           IGI
## 268  6988     E           HRD
## 68   3861     I           GIA
## 136  9153     H           GIA
## 176  1147     F           IGI
## 18   1269     G           GIA
## 180  1283     G           IGI
## 179  1485     F           IGI
## 50   3407     G           GIA
## 98   5961     E           GIA
## 57   2892     I           GIA
## 5    1641     D           GIA
## 232  3851     F           HRD
## 3    1510     G           GIA
## 89   5800     E           GIA
## 238  3667     F           HRD
## 83   3851     F           GIA
## 2    1510     E           GIA
## 104  4221     I           GIA
## 128  8788     H           GIA
## 171  1149     E           IGI
## 218  4831     E           IGI
## 257  5193     H           HRD
## 116 15582     D           GIA
## 101  5030     H           GIA
## 282 11696     F           HRD
## 208  1506     I           IGI
## 119  9757     E           GIA
## 181  1149     H           IGI
## 204  1821     F           IGI
## 220  3821     G           IGI
## 106  4585     H           GIA
## 112  5386     H           GIA
## 193  1459     F           IGI
## 216  4070     E           IGI
## 20   1222     H           GIA
## 73   5263     F           GIA
## 46   3418     F           GIA
## 299  9993     G           HRD
## 290  8788     I           HRD
## 138  8873     I           GIA
## 12   1202     G           GIA
## 161   863     F           IGI
## 163   842     G           IGI
## 63   4759     G           GIA
## 289  8788     H           HRD
## 118 10588     E           GIA
## 256  6372     G           HRD
## 44   3635     E           GIA
## 43   3490     D           GIA
## 262  8611     F           HRD
## 302  9153     H           HRD
## 81   3501     E           GIA
## 62   4401     G           GIA
## 304  8175     I           HRD
## 297 11811     F           HRD
## 254  5510     G           HRD
## 292  8095     I           HRD
## 287  9480     H           HRD
## 47   3921     D           GIA
## 142  9666     F           GIA
## 134 10272     F           GIA
## 109  4355     I           GIA
## 94   5122     H           GIA
## 172  1057     F           IGI
## 228  8873     H           IGI
## 139  8455     I           GIA
## 221  5607     G           IGI
## 19   1316     H           GIA
## 188  1233     H           IGI
## 201  1503     H           IGI
## 41   3567     F           GIA
## 14   1098     I           GIA
## 251  6867     E           HRD
## 123  9480     F           GIA
## 24   1255     H           GIA
## 76   5705     H           GIA
## 6    1555     E           GIA
## 307  8959     H           HRD
## 86   3415     H           GIA
## 105  5815     G           GIA
## 149  9302     H           GIA
## 288  9065     H           HRD
## 54   3605     F           GIA
## 203  1636     F           IGI
## 120 13913     F           GIA
## 75   4948     I           GIA
## 33   1572     I           GIA
## 284 10450     G           HRD
## 164   800     G           IGI
## 121 10588     F           GIA
## 15   1693     E           GIA
## 195  1218     H           IGI
## 182  1082     I           IGI
## 125  9619     G           GIA
## 259  5333     F           HRD
## 115  7680     F           GIA

Therefore, there were 3 levels of randomization: measurement, assignment, and performing.

Eeplicate: Are there replicates and/or repeated measures?

Due to the simplification of the dataset, which was a cross-section during 2000, I cannot verify if there were any replicates and/or repeated measures.

Block: Did you use blocking in the design?

As mentioned previously, the design definitely includes blocking. The IV factor (certification) was blocked to reduce the nuisance factor in how the certification process may vary among the institutions due to differing perspectives and industry standards, while color of a diamond can be viewed more or less with less conflicts and consensus. An ANOVA test was examined, with the corresponding results for certification neglected.

3. (Statistical) Analysis

(Exploratory Data Analysis) Graphics and descriptive summary

Below is the descriptive summary of the data.

#Display the descriptive statistics
summary(diamond)
##      price       color  certification
##  Min.   :  638   D:16   GIA:151      
##  1st Qu.: 1625   E:44   HRD: 79      
##  Median : 4215   F:82   IGI: 78      
##  Mean   : 5019   G:65                
##  3rd Qu.: 7446   H:61                
##  Max.   :16008   I:40

Also, to take a closer look at the general overview of the data, I plotted a histogram of the price of diamonds. The histogram seems to have a right-skew and mostly clustered within the approximate range of 0 and 5000.

#Histogram and Boxplot of Price
hist(diamond$price, main="Histogram of Diamond Prices")

boxplot(diamond$price, main="Boxplot of Diamond Prices")

You can see the different certifications labelled as G, I, and H.

#Plot for Certification
plot(diamond$price, pch=as.character(diamond$certification))

From this graphic, the IGI certification seemed to be not have as many expensive diamonds as the other certifications.

boxplot(diamond$price~diamond$certification, main = "Price vs Certification", xlab = "Price in Singapore $",ylab="Certification")

Testing

The Main Effect

The me was calculated in a similar fashion as the previous project. Here, I considered the non-blocking IV, which was the color.

#Calculating the main effect to get the effect size (later confirm with G*Power)
#Take the difference between the means of the non-blocking IV (color)

m1 <- mean(subset(diamond$price, diamond$color=="D"))
m2 <- mean(subset(diamond$price, diamond$color=="E"))
m3 <- mean(subset(diamond$price, diamond$color=="F"))
m4 <- mean(subset(diamond$price, diamond$color=="G"))
m5 <- mean(subset(diamond$price, diamond$color=="H"))
m6 <- mean(subset(diamond$price, diamond$color=="I"))

me <- m6 - m5 - m4 - m3 - m2 - m1

#Print the result
me
## [1] -21610.39

me = -21610.39

There was a difference between the means. A boxplot below was plotted to elaborate on the me.

#Boxplots
boxplot(diamond$price~diamond$color, main = "Price vs Color", xlab = "Price in Singapore $",ylab="Color")

First of all, from the boxplot, there were no outliers present. Also, the median for D was higher than most of the other colors, followed by H,I,E,F, and then G. F and G were fairly close, but the variance or range differed.

G*Power to find effect size and sample size N

Effect size by hand

I first found the effect size by hand using R, and then later confirmed it with G*Power. The equation to compute the effect size was straightforward because it was taking the differences of the averages and divided by the standard deviation.

m1 <- mean(diamond$color=="D")
m2 <- mean(diamond$color=="E")
m3 <- mean(diamond$color=="F")
m4 <- mean(diamond$color=="G")
m5 <- mean(diamond$color=="H")
m6 <- mean(diamond$color=="I")

diff_mean = m6 - m5 - m4 - m3 -m2 -m1
sd1 <- sd(diamond$color=="D")
sd2 <- sd(diamond$color=="E")
sd3 <- sd(diamond$color=="F")
sd4 <- sd(diamond$color=="G")
sd5 <- sd(diamond$color=="H")
sd6 <- sd(diamond$color=="I")

sd = (sd1+sd2+sd3+sd4+sd5+sd6)/2

result = abs(diff_mean/sd)
result
## [1] 0.6853988

Effect size using GPower On GPower, I set the ‘Test Family’ to F-tests and ANOVA (Fixed effects, omnibus, one-way). Then, I chose Sensitivity to compute the required effect size.

As mentioned before, Type I Error (α) = 0.05 Type II error (β) = 0.1.

Effect size = 0.6853988

Determining the sample size N

On G*Power, I kept the other conditions the same, except the ‘Type of power analysis’ to A Priori.

The following parameters were added into G*Power:

Effect Size = 0.6853988 Type I Error (α) = 0.05 Power = 0.9 Number of Groups = 3

Below is the screenshot:

Effect Size

Effect Size

From the G*Power output, I found that the critical F-statistics was 3.0253 and the sample size N needed to be 309. N=309 would be the amount needed to achieve the desired power.

Sample Size N=33

Now that I got the required sample size needed, I decided to randomly select from the data.

#Set the right sample size
N = 33
set.seed(8)

#rdiamond = new diamond data
rdiamond = diamond[sample(nrow(diamond),N),]

ANOVA

First of all, I wanted to show the ANOVA result that was before the selected N experimental cases at random.

Remember: treatment: color block: certification (3 groups)

#Randomized Complete Block Design with ANOVA Analysis
fit.lm = lm(price ~ treatment+block, data=data)
anova(fit.lm)
## Analysis of Variance Table
## 
## Response: price
##            Df     Sum Sq   Mean Sq F value  Pr(>F)    
## treatment   5  108229779  21645956   2.638 0.02364 *  
## block       2  985593642 492796821  60.058 < 2e-16 ***
## Residuals 300 2461603926   8205346                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(fit.lm)
## 
## Call:
## lm(formula = price ~ treatment + block, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6286.7 -1604.7  -557.2  1675.7  8765.2 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   7588.7      727.3  10.435  < 2e-16 ***
## treatmentE   -1936.6      837.2  -2.313 0.021384 *  
## treatmentF   -2441.0      784.4  -3.112 0.002037 ** 
## treatmentG   -2689.6      802.9  -3.350 0.000912 ***
## treatmentH   -2873.3      809.5  -3.550 0.000448 ***
## treatmentI   -2129.4      848.3  -2.510 0.012589 *  
## block2       -2978.4      402.9  -7.392 1.45e-12 ***
## block3        2045.9      401.0   5.102 5.98e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2864 on 300 degrees of freedom
## Multiple R-squared:  0.3076, Adjusted R-squared:  0.2915 
## F-statistic: 19.04 on 7 and 300 DF,  p-value: < 2.2e-16
#Plots 
plot(fit.lm)

interaction.plot(data$treatment, data$block, data$price)

Using the sample size, N=33, I used the ndiamond. The 2-way ANOVA test was then performed. Since the certification was blocked, I ignored the F statistics.

#ANOVA
anova = aov(rdiamond$price ~ rdiamond$color+rdiamond$certification)
summary(anova)
##                        Df    Sum Sq  Mean Sq F value Pr(>F)  
## rdiamond$color          5 106615419 21323084   2.866 0.0352 *
## rdiamond$certification  2  52942085 26471043   3.558 0.0437 *
## Residuals              25 185998031  7439921                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The null hypothesis was that there is no difference between the color of the diamond in regards to the price of the diamonds. The results from the certification would be ignored because it was the blocking variable. From the ANOVA results, the F-statistic for the treatment (color) was 2.866 and the p-value was 0.0352. The alpha value was set to 0.05. The p-value was much less than the alpha value and the F value was not close to 0.

Therefore, since the p-value < α, we reject the null hypothesis, and the result was statistically significant. The certification effect was not the research question, so it was neglected.

Two Alternatives to NHST

Null Hypothesis Statistical Testing (NHST) was explored to practice the skills and apply them to the experiment.

Multiple Models

Multiple models are used to minimize the importance of the need to reject the null hypothesis. It specifies how well the data fits the set of models. The data are not to be assumed that the they would be derived directly from the null hypothesis. In this case where the outcome was significant, I want to find a family of model that shows the opposite outcome, so insignificant. Since I had significance, I want to find insignificance. The purpose of multiple models is to show that the results are truly significant or insignificant (depending on your results) by showing a model or family of models for which I would not get the same significant results.

From the original model, I found that the results were significant.

I first looked at the interactions.

#Check Interaction 
mm1 = aov(rdiamond$price ~ rdiamond$color*rdiamond$certification)
summary(mm1)
##                                       Df    Sum Sq  Mean Sq F value Pr(>F)
## rdiamond$color                         5 106615419 21323084   2.590 0.0580
## rdiamond$certification                 2  52942085 26471043   3.215 0.0615
## rdiamond$color:rdiamond$certification  5  21341985  4268397   0.518 0.7593
## Residuals                             20 164656046  8232802               
##                                        
## rdiamond$color                        .
## rdiamond$certification                .
## rdiamond$color:rdiamond$certification  
## Residuals                              
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The interactions showed some significance, but the interaction was fairly insignificant. This strengthens the assumption that there was no interaction between the blocking and response variable.

Then, I looked at the case, where I transformed the outcome of the Y by increasing the power to 2.

rdiamond$price = (rdiamond$price)^2
mm2= aov(rdiamond$price ~ rdiamond$color+rdiamond$certification)
summary(mm2)
##                        Df    Sum Sq   Mean Sq F value Pr(>F)  
## rdiamond$color          5 1.219e+16 2.438e+15   2.117 0.0967 .
## rdiamond$certification  2 4.161e+15 2.081e+15   1.807 0.1850  
## Residuals              25 2.879e+16 1.152e+15                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Here, I noticed that the results were insignificant.

I tried more cases. For this, I multiplied the coefficient 4.

rdiamond$price = (rdiamond$price)*4
mm3 = aov(rdiamond$price ~ rdiamond$color+rdiamond$certification)
summary(mm3)
##                        Df    Sum Sq   Mean Sq F value Pr(>F)  
## rdiamond$color          5 1.950e+17 3.900e+16   2.117 0.0967 .
## rdiamond$certification  2 6.658e+16 3.329e+16   1.807 0.1850  
## Residuals              25 4.607e+17 1.843e+16                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The results were insignificant.

I also looked into log.

N = 33
mdiamond = diamond[sample(nrow(diamond),N),]
mdiamond$price = log(mdiamond$price)
mm4 = aov(mdiamond$price ~ mdiamond$color+mdiamond$certification)
summary(mm4)
##                        Df Sum Sq Mean Sq F value Pr(>F)  
## mdiamond$color          4  1.359  0.3398   0.740 0.5735  
## mdiamond$certification  2  3.714  1.8572   4.043 0.0296 *
## Residuals              26 11.944  0.4594                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Interestingly, the results were insignificant, while the certification showed significance.

Next, I looked into squaring the certification.

mdiamond$certification2 = as.integer(mdiamond$certification)^2
mm5 = aov(mdiamond$price ~ mdiamond$color+mdiamond$certification2)
summary(mm5)
##                         Df Sum Sq Mean Sq F value Pr(>F)
## mdiamond$color           4  1.359  0.3398   0.586  0.675
## mdiamond$certification2  1  0.003  0.0030   0.005  0.943
## Residuals               27 15.655  0.5798

This made the results insignificant as well.

Finally, I considered other fixed effect models, where I added a new factor, clarity.

test = read.csv("nmodel.csv", header = TRUE, sep=",")
N = 33
nm = test[sample(nrow(test),N),]
mm6 = aov(test$price ~ test$clarity+test$certification)
summary(mm6)
##                     Df    Sum Sq   Mean Sq F value   Pr(>F)    
## test$clarity         4 2.997e+08  74917212   8.826 9.46e-07 ***
## test$certification   2 7.009e+08 350460667  41.290  < 2e-16 ***
## Residuals          301 2.555e+09   8487831                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

This time, the results were significant. The clarity seemed to have affected the price of diamond a lot. From an intuitive point of view, this makes sense.

Therefore, with multiple models I was able to show different family of models that predicted no difference, positive difference, and negative difference. In other words, I was able to show that other models can make the results insignificant, contrast to my significance.

Confidence Intervals

The second NHST I considered was confidence intervals. Confidence intervals are a medium or tool to assess the probability that a parameter lies within a certain range of values. It is used to appraise results found. To interpret confidence intervals, I would use to see where the values fall under this particular range of values. It estimates the degree to which the values is most likely going to be the true value. I prefer confidence intervals over effect size because it can replace significance testing.

First, I used confint to see the confidence interval.

confint(anova)
##                                2.5 %     97.5 %
## (Intercept)               -4315.6425  6919.6425
## rdiamond$colorE             750.8011 12825.7181
## rdiamond$colorF           -2575.6021  9585.2104
## rdiamond$colorG           -2337.4214 10092.1483
## rdiamond$colorH           -1273.2912 10817.0493
## rdiamond$colorI           -1135.7277 13079.3409
## rdiamond$certificationHRD -2768.7008  2674.0127
## rdiamond$certificationIGI -6608.6126  -798.3339

Then, I used Tukey’s Method. Tukey’s Method creates confidence intervals for paired difference among factor levels and helps control the error rate, which I specified as 0.05.

tukey = TukeyHSD(anova)
plot(tukey)

TukeyHSD(anova)
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = rdiamond$price ~ rdiamond$color + rdiamond$certification)
## 
## $`rdiamond$color`
##           diff       lwr        upr     p adj
## E-D  6252.4286 -2733.905 15238.7617 0.2981618
## F-D  1853.5556 -7007.088 10714.1995 0.9862680
## G-D  3127.2000 -6081.051 12335.4513 0.8973377
## H-D  4754.1250 -4161.726 13669.9760 0.5793071
## I-D  2268.3333 -7438.016 11974.6825 0.9776018
## F-E -4398.8730 -8635.071  -162.6749 0.0385353
## G-E -3125.2286 -8047.246  1796.7888 0.3935379
## H-E -1498.3036 -5848.793  2852.1863 0.8918532
## I-E -3984.0952 -9784.748  1816.5579 0.3111946
## G-F  1273.6444 -3414.968  5962.2565 0.9575113
## H-F  2900.5694 -1183.986  6985.1245 0.2779638
## I-F   414.7778 -5189.186  6018.7411 0.9999037
## H-G  1626.9250 -3165.201  6419.0509 0.8974604
## I-G  -858.8667 -6997.701  5279.9676 0.9978711
## I-H -2485.7917 -8176.643  3205.0600 0.7571489
## 
## $`rdiamond$certification`
##               diff       lwr       upr    p adj
## HRD-GIA  -194.6715 -3397.414 3008.0710 0.987449
## IGI-GIA -2237.9506 -5011.607  535.7057 0.130649
## IGI-HRD -2043.2791 -5624.054 1537.4958 0.345518

The confidence interval for the certification contained zero. A zero in a confidence interval range means that there is no difference. In other words, the confidence interval that contained zero means that there is no statistical significance. For my case, there were inconsistencies and consistencies depending on the family model I looked with my ANOVA tests because certification was not significant in some.

Estimation

I estimated the estimation of the parameters for the N=33.

fit.lm2 = lm(rdiamond$price ~ rdiamond$color+rdiamond$certification)
summary(fit.lm2)
## 
## Call:
## lm(formula = rdiamond$price ~ rdiamond$color + rdiamond$certification)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -233227043  -87657323   -5532014   65925768  237594233 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)                  6780816  135746198   0.050   0.9606  
## rdiamond$colorE            275474231  145890743   1.888   0.0707 .
## rdiamond$colorF            116416918  146928544   0.792   0.4356  
## rdiamond$colorG            135329147  150175704   0.901   0.3761  
## rdiamond$colorH            174864663  146077092   1.197   0.2425  
## rdiamond$colorI            219826615  171748338   1.280   0.2123  
## rdiamond$certificationHRD  -16358766   65759585  -0.249   0.8056  
## rdiamond$certificationIGI -133199705   70200556  -1.897   0.0694 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 135700000 on 25 degrees of freedom
## Multiple R-squared:  0.3622, Adjusted R-squared:  0.1836 
## F-statistic: 2.028 on 7 and 25 DF,  p-value: 0.0913

Note: [Some outputs were deleted.]

coef(anova)
##               (Intercept)           rdiamond$colorE 
##                1302.00000                6788.25961 
##           rdiamond$colorF           rdiamond$colorG 
##                3504.80412                3877.36346 
##           rdiamond$colorH           rdiamond$colorI 
##                4771.87901                5971.80659 
## rdiamond$certificationHRD rdiamond$certificationIGI 
##                 -47.34404               -3703.47326

Diagnostics / Model Adequacy Checking

To check if the normality assumption holds, I used the qqnorm and qqlines.

#Check for Normality
qqnorm(residuals(fit.lm2))
qqline(residuals(fit.lm2))

#Residual Plot
plot(fitted(anova),residuals(anova))

Based on the qqnorm and qqline, the qqnorm line looks fairly linear and closely fits the qqline drawn. Also, the residual plot showed an evenly distributed residual as a function of fitted values. Also, there was a good scatter and not close to 0. The pattern seemed random and evenly distributed, two important conditions that satisfy the normality check. Therefore, the normality assumption on the IV was satisfied because the residuals were evenly distributed and the qqnorm line is linear enough. The normality check passed. There is no need to do transformation because the normality assumption was passed.

Summary: Overall, the experiment finding was an interesting one. Based on the p-values, we learned that the color of the diamond does affect the price. The study focused on a blocked design of certification. Therefore, we rejected the null hypothesis. There was a difference between the color of the diamond in regards to the price of the diamonds. However, by exploring different family of models through multiple models, we were able to see changes in significance. In general, the color of the diamond seemed to have an effect on the price of the diamond.

In terms of the model, I would say this was a mixed effects model because there was randomization from the data collection and the variables were selected. However, some might argue it was a fixed effects model because the data might not have represented the general population.

For the findings, it was interesting to explore the multiple models.

The analysis made sense and the block design was well executed.

4. References to the literature

  1. http://www.stat.purdue.edu/~xuanyaoh/stat350/xyMar26Lec23.pdf

  2. http://people.stat.sfu.ca/~thompson/stat403-650/randomized_block.html

  3. http://www.r-tutor.com/elementary-statistics/analysis-variance/randomized-block-design

  4. https://vincentarelbundock.github.io/Rdatasets/doc/Ecdat/Diamond.html

  5. http://www.stat.wisc.edu/~ane/st572/notes/lec24.pdf

  6. https://statistics.laerd.com/spss-tutorials/multiple-regression-using-spss-statistics.php

5. Appendices

#Benjamin Byeon
#10/30/16
#Section 1

#Load the data
diamond = read.csv("diamonds.csv", header = TRUE, sep=",")
diamond

#System under test
head(diamond)
tail(diamond)
names(diamond)

#Display the structure
str(diamond)

#Check the levels
levels(diamond$price)
levels(diamond$color)
levels(diamond$certification)

#Randomize the data
index = sample(1:nrow(diamond), 308, replace=FALSE)
rdata = diamond[index,]
rdata 

#Display the descriptive statistics
summary(diamond)

#Histogram
hist(diamond$price, main="Histogram of Diamond Prices")

#Plot for certification
plot(diamond$price, pch=as.character(diamond$certification))

#Boxplots
boxplot(diamond$price~diamond$color, main = "Price vs Color", xlab = "Price in Singapore $",ylab="Color")
boxplot(diamond$price~diamond$certification, main = "Price vs Certification", xlab = "Price in Singapore $",ylab="Certification")



#Calculating the main effect to get the effect size (rather confirm with G*Power)
#Take the difference between the means of the non-blocking IV (color)
m1 <- mean(subset(diamond$price, diamond$color=="D"))
m2 <- mean(subset(diamond$price, diamond$color=="E"))
m3 <- mean(subset(diamond$price, diamond$color=="F"))
m4 <- mean(subset(diamond$price, diamond$color=="G"))
m5 <- mean(subset(diamond$price, diamond$color=="H"))
m6 <- mean(subset(diamond$price, diamond$color=="I"))

me <- m6 - m5 - m4 - m3 - m2 - m1

#Print the result
me

#Effect Size
m1 <- mean(diamond$color=="D")
m2 <- mean(diamond$color=="E")
m3 <- mean(diamond$color=="F")
m4 <- mean(diamond$color=="G")
m5 <- mean(diamond$color=="H")
m6 <- mean(diamond$color=="I")

diff_mean = m6 - m5 - m4 - m3 -m2 -m1
sd1 <- sd(diamond$color=="D")
sd2 <- sd(diamond$color=="E")
sd3 <- sd(diamond$color=="F")
sd4 <- sd(diamond$color=="G")
sd5 <- sd(diamond$color=="H")
sd6 <- sd(diamond$color=="I")

sd = (sd1+sd2+sd3+sd4+sd5+sd6)/2

result = abs(diff_mean/sd)
result


#Organize the data to treatments and block
data = read.csv("block.csv", header = TRUE, sep=",")
data

#Check the structure of the organized data
str(data)
data$block = factor(data$block)
data$block

#Randomized Complete Block Design with ANOVA Analysis
fit.lm = lm(price ~ treatment+block, data=data)
anova(fit.lm)

#Plots 
plot(fit.lm)
interaction.plot(data$treatment, data$block, data$price)


#Set the right sample size
N = 33
set.seed(8)

#rdiamond = new diamond data
rdiamond = diamond[sample(nrow(diamond),N),]

#ANOVA
anova = aov(rdiamond$price ~ rdiamond$color+rdiamond$certification)
summary(anova)


#Multiple models

#Check Interaction 
mm1 = aov(rdiamond$price ~ rdiamond$color*rdiamond$certification)
summary(mm1)

#Insignificant - original was significant 
rdiamond$price = (rdiamond$price)^2
mm2= aov(rdiamond$price ~ rdiamond$color+rdiamond$certification)
summary(mm2)

#Insignificant 
rdiamond$price = (rdiamond$price)*4
mm3 = aov(rdiamond$price ~ rdiamond$color+rdiamond$certification)
summary(mm3)

#Insignificant
N = 33
mdiamond = diamond[sample(nrow(diamond),N),]
mdiamond$price = log(mdiamond$price)
mm4 = aov(mdiamond$price ~ mdiamond$color+mdiamond$certification)
summary(mm4)

#Insignificant
mdiamond$certification2 = as.integer(mdiamond$certification)^2
mm5 = aov(mdiamond$price ~ mdiamond$color+mdiamond$certification2)
summary(mm5)


#Other fixed effect model - Significant 
test = read.csv("nmodel.csv", header = TRUE, sep=",")
N = 33
nm = test[sample(nrow(test),N),]
mm6 = aov(test$price ~ test$clarity+test$certification)
summary(mm6)

#Confidence Interval
confint(anova1)
tukey = TukeyHSD(anova1)
plot(tukey)
TukeyHSD(anova)

#Estimation
fit.lm2 = lm(rdiamond$price ~ rdiamond$color+rdiamond$certification)
summary(fit.lm2)
coef(anova)

#Check for Normality
qqnorm(residuals(fit.lm2))
qqline(residuals(fit.lm2))

#Residual Plot
plot(fitted(anova),residuals(anova))