Part 1: Data Cleaning

# Load the raw exercise data. stringsAsFactors = TRUE is set explicitly because the
# inspection steps below (per-level counts from summary(), "Levels:" in the printed
# output) rely on text columns being factors, which is no longer the read.csv()
# default since R 4.0.0.
d <- read.csv("janiszewski_rep_exercise.csv", header = TRUE, stringsAsFactors = TRUE)
  1. Looking at what variables matter
# Keep only the worker id, submit time, condition, the three anchor prices
# and the three answer columns
d <- d[c(
  "WorkerId", "SubmitTime", "Input.condition",
  "Input.price1", "Input.price2", "Input.price3",
  "Answer.dog_cost", "Answer.plasma_cost", "Answer.sushi_cost"
)]
  1. Looking for Repeats and Removing
# Summary statistics: number of submissions per worker.
# Any count above 1 in the output below marks a worker who submitted more than once.
summary(d$WorkerId) #repeats
## A10316ZXDCW4TT A14U5M64AK4MAR A16VNOCOSQS8H6 A17FFPDVAJ2SQJ A18L9BY7DNA8D3 
##              1              1              1              1              1 
## A1A70X1I06UZKH A1DCWHY56LLX11 A1F0K96LA51ICQ A1HQRT1UG6ERGL A1IAC1SAII8YSO 
##              1              1              1              1              1 
## A1JGZFWWT1KMVV A1OKTCXWVYNXE6 A1OYEERWIFE1E2 A1PYG25XQ6H75Z A1SFABJ4NX5DFY 
##              1              1              1              1              1 
## A1TQB6QX4RKBJV A1VYX5VKZ0CTWU A1W8FRDIXULFCW A1X4S0LBLQ5ICD A1YGOYNWW3Q9SQ 
##              1              3              1              1              1 
## A1YQ5UO86XM73Z A1ZLYD5GP1FLYL A21JRS3L6PMJ5Q A21WM0TU34VX0H A228LSM1VQ7GKY 
##              1              1              1              1              1 
## A26B18YJBK0RF5 A28GTG78A8VHA8 A29JHEW30K9LIU A2AMJWFVBY4TOX A2AVFNOKFHQ2ME 
##              1              1              1              1              1 
## A2B9QSYOSS5YTM A2BE9ZY7SQJUI1 A2CGAOF4G65D67 A2CJ265G7KDQPQ A2CQ6N923E0AHA 
##              1              1              1              1              1 
## A2EUAY8BZ2H272 A2FNO46BROC6MW A2FZNTF2DA43I0 A2GREHAB3I8YTW A2HU0HN75JRT9J 
##              1              1              1              1              1 
## A2JKCS6UCAUARS A2NFI8WZXF9JYD A2P1GEHANE46W1 A2PDWVFT2AVEW9 A2TL18J7K3G2A3 
##              1              1              1              1              1 
## A2UOKD7ZRDQA0K A2ZJ282IMK4033 A36BYL3PXBRQHR A3800T0V4VADFI A38E9ET5TC41WD 
##              1              1              1              1              1 
## A3AJLUNBK4EU68 A3CWGT3EMQ17Y0 A3D5C3AR576B1T A3EFN4B29IY00A A3G07E5QWOJTAR 
##              1              1              1              1              1 
## A3G4OOZ25NZU3I A3HMBHM8HJLKRD A3IM0Z8F01XLE6 A3JQS4QJ5YMWZX A3LBUWPBL3FV59 
##              1              1              1              1              1 
## A3LQAX287GQ35Q A3NBCP9NSX3670 A3NS3B2BFI6AAO A3O97JQ9J6RHLM A3OWHW7XYQU52K 
##              1              1              1              1              1 
## A3R1YTYGRQEZU3 A3S99S9NJ6X08J  A4X4GQMK34FGB  A54HH8I8XLIEP  A5IUO2T5MZQUZ 
##              1              1              1              1              2 
##  A9J48W1C9IZAI  AAOA3WLON3GKX  AB6C1SZWVNQCC  ACSZ7IOG3J5NR  AEA1MB8LFXEGH 
##              1              1              1              1              1 
##  AFODO5KH3V4EN  AG5OJVGC1ZH8Q  AHV4U78TUUDKI  AI2GXFO5FLLZ1  AKDETA3O440J7 
##              1              1              1              1              1 
##  AMBTRKSCFZU2M  AN6NSLCHI8SY4  AR3I3KJIGB4U6  AT2SP7P5DXNUI  ATOES4DDGVTGZ 
##              1              1              1              1              1 
##  AW6DOBYV5GVN6  AWR6MOBX9YSOM 
##              1              1
d$WorkerId[duplicated(d$WorkerId)] # workers A1VYX5VKZ0CTWU and A5IUO2T5MZQUZ submitted more than once
## [1] A1VYX5VKZ0CTWU A5IUO2T5MZQUZ  A1VYX5VKZ0CTWU
## 87 Levels: A10316ZXDCW4TT A14U5M64AK4MAR A16VNOCOSQS8H6 ... AWR6MOBX9YSOM
# I am going to keep the earliest submission of each of these workers.

# List every submit time of the two repeat workers to identify which submissions came later
d$SubmitTime[d$WorkerId=="A1VYX5VKZ0CTWU"] #later ones are Thu Jan 26 20:45:26 GMT 2012 Thu Jan 26 20:45:55 GMT 2012
## [1] Thu Jan 26 20:44:20 GMT 2012 Thu Jan 26 20:45:26 GMT 2012
## [3] Thu Jan 26 20:45:55 GMT 2012
## 90 Levels: Fri Jan 27 00:40:50 GMT 2012 ... Wed Jan 25 23:50:08 GMT 2012
d$SubmitTime[d$WorkerId=="A5IUO2T5MZQUZ"] #later one is Thu Jan 26 23:50:11 GMT 2012 (it is listed first, i.e. it comes first in row order)
## [1] Thu Jan 26 23:50:11 GMT 2012 Thu Jan 26 23:47:53 GMT 2012
## 90 Levels: Fri Jan 27 00:40:50 GMT 2012 ... Wed Jan 25 23:50:08 GMT 2012
# Remove the later, repeated submissions identified above so that each worker
# keeps only their earliest one
d <- d[!(
  (d$WorkerId == "A1VYX5VKZ0CTWU" &
    (d$SubmitTime == "Thu Jan 26 20:45:26 GMT 2012" |
      d$SubmitTime == "Thu Jan 26 20:45:55 GMT 2012")) |
    (d$WorkerId == "A5IUO2T5MZQUZ" &
      d$SubmitTime == "Thu Jan 26 23:50:11 GMT 2012")
), ]
  1. Looking at Input Condition and converting to numeric
# Inspect the condition and the three anchor-price columns before converting them to numbers
summary(d$Input.condition) # 29 participants in each of the over / rounded / under conditions
##    over rounded   under 
##      29      29      29
summary(d$Input.price1)
## 4,988 5,000 5,012 
##    29    29    29
table(d$Input.price1) # values contain thousands-separator commas, so the column is not numeric yet
## 
## 4,988 5,000 5,012 
##    29    29    29
summary(d$Input.price2) # same issue: commas
## 2,492 2,500 2,508 
##    29    29    29
table(d$Input.price2)
## 
## 2,492 2,500 2,508 
##    29    29    29
summary(d$Input.price3) # already numeric: summary() reports quantiles instead of level counts
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    8.64    8.64    9.00    9.00    9.36    9.36
table(d$Input.price3)
## 
## 8.64    9 9.36 
##   29   29   29
# Convert the comma-formatted anchor prices to numeric.
# Every comma is stripped instead of recoding each observed value by hand: the result
# is the same for the values tabulated above ("4,988", "5,000", ...), and any other
# comma-formatted price would also convert instead of silently becoming NA.
d$Input.price1 <- as.numeric(gsub(",", "", as.character(d$Input.price1), fixed = TRUE))

d$Input.price2 <- as.numeric(gsub(",", "", as.character(d$Input.price2), fixed = TRUE))
  1. Looking at Answers and converting to numeric
summary(d$Answer.dog_cost) # not numeric yet: contains a written-out number ("five hundred") and a comma-formatted "2,000"
##         1000         1500         1600         1658         1687 
##            6            6            2            0            0 
##         1700         1749         1750         1790         1800 
##            2            1            2            1            2 
##         1850         1875         1900         1950         1990 
##            1            1            4            1            1 
##        2,000         2000         2100         2192         2200 
##            1           26            2            1            8 
##         2250         2292         2299         2300         2325 
##            3            1            1            3            1 
##         2350         2400         2450         2482      2499.99 
##            1            1            2            1            1 
##         2500          500          800 five hundred 
##            1            1            1            1
summary(d$Answer.plasma_cost) # not numeric yet: contains a comma-formatted "4,000"; also note the isolated "0.45" entry
##    0.45    1200    1500    2000    2500    2999    3000    3200    3250 
##       1       1       1       1       4       1       3       2       1 
##    3258    3458    3499    3500    3800   4,000    4000    4120    4200 
##       0       0       1       6       1       1       9       1       2 
##    4300    4350    4400    4495    4500    4540    4578    4600    4650 
##       2       1       1       1      15       1       1       1       1 
##    4675    4699    4700    4750    4800    4849    4850    4888    4899 
##       1       1       5       2       4       1       1       2       1 
##    4900    4968    4995    4997    4999 4999.99    5000 
##       1       1       2       1       2       1       1
summary(d$Answer.sushi_cost) # not numeric yet: contains a blank entry, "7,75" (comma instead of a decimal point) and the written-out, misspelled "ehight"
##           6.5   6.78   6.95   6.99      7   7,75   7.02   7.35    7.5 
##      1      1      0      2      3      3      1      0      1     10 
##   7.56   7.58   7.69   7.75    7.8   7.89   7.99      8   8.25   8.46 
##      1      1      1      3      2      2      9     18      2      1 
##   8.49    8.5   8.69    8.7   8.95   8.99      9    9.3 ehight 
##      1     11      1      1      2      4      3      1      1
# Recode the free-text answers that are not plain numbers, then convert each
# answer column to numeric.

# Dog cost: one written-out number and one comma-formatted value
d$Answer.dog_cost <- as.character(d$Answer.dog_cost)
d$Answer.dog_cost[d$Answer.dog_cost == "five hundred"] <- "500"
d$Answer.dog_cost[d$Answer.dog_cost == "2,000"] <- "2000"
d$Answer.dog_cost <- as.numeric(d$Answer.dog_cost)

# Plasma cost: the summary above lists the odd entry as "0.45", so that spelling is
# matched as well as ".45" (comparing against ".45" alone would not match that level
# and would leave the value unchanged). Treating it as 4500 is an analyst's
# judgment call about what the participant meant.
d$Answer.plasma_cost <- as.character(d$Answer.plasma_cost)
d$Answer.plasma_cost[d$Answer.plasma_cost == "4,000"] <- "4000"
d$Answer.plasma_cost[d$Answer.plasma_cost %in% c(".45", "0.45")] <- "4500"
d$Answer.plasma_cost <- as.numeric(d$Answer.plasma_cost)

# Sushi cost: one misspelled written-out number and one value using a comma
# as the decimal mark
d$Answer.sushi_cost <- as.character(d$Answer.sushi_cost)
d$Answer.sushi_cost[d$Answer.sushi_cost == "ehight"] <- "8"
d$Answer.sushi_cost[d$Answer.sushi_cost == "7,75"] <- "7.75"
d$Answer.sushi_cost <- as.numeric(d$Answer.sushi_cost)

Part 2: Making Data Tidy

# Reload the cleaned data and reshape it so that each row is one participant x item,
# with the anchor price and the participant's estimate in separate columns.
# NOTE(review): nothing in the visible script writes "janiszewski_rep_cleaned.csv" --
# confirm where this file is produced.
d1 <- read.csv("janiszewski_rep_cleaned.csv", header = TRUE)
d.tidy <- d1 %>%
  # keep the important variables and rename them in the same step
  select(
    workerid = WorkerId,
    condition = Input.condition,
    plasma_anchor = Input.price1,
    dog_anchor = Input.price2,
    sushi_anchor = Input.price3,
    dog_cost = Answer.dog_cost,
    plasma_cost = Answer.plasma_cost,
    sushi_cost = Answer.sushi_cost
  ) %>%
  # stack the six value columns: `type` holds the former column name
  # (e.g. "dog_anchor"), `cost` holds the corresponding amount
  gather(
    type, cost,
    dog_anchor, plasma_anchor, sushi_anchor,
    dog_cost, plasma_cost, sushi_cost
  ) %>%
  # split the former column name into the item (dog / plasma / sushi) and
  # the kind of value ("anchor" or "cost")
  separate(type, c("type", "anchor")) %>%
  # give the anchor amount and the participant's answer their own columns
  spread(anchor, cost)

Part 3: Manipulating Data

  1. Looking at Histograms
# Histogram of the participants' answers, one panel per item type with free axis scales.
# The plot is stored under its own name so that the cleaned data frame `d` from
# Part 1 is not overwritten by a plot object.
cost_hist <- ggplot(d.tidy, aes(cost)) +
  geom_histogram() +
  facet_wrap(~type, scales = "free")
cost_hist
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

The histograms show that sushi looks roughly normally distributed, whereas plasma looks left-skewed (most answers sit near the top of the range, with a long tail toward lower values), and so does dog, to a lesser extent.

  1. Removing duplicates using dplyr

I already removed the duplicates earlier, but I will do it again here using dplyr's "distinct" function.

# Re-read the raw data and keep one row per worker using dplyr::distinct().
d.raw <- read.csv("../data/janiszewski_rep_exercise.csv")

# distinct() keeps the first row it meets for each worker, and the SubmitTime listing
# in Part 1 shows the later submission of A5IUO2T5MZQUZ first. Sorting by the parsed
# submit time first makes "first" mean "earliest", as in Part 1.
# If the timestamps cannot be parsed (e.g. non-English locale for %a / %b) they are
# all NA and order() leaves the rows in their original order.
submit_time <- as.POSIXct(
  as.character(d.raw$SubmitTime),
  format = "%a %b %d %H:%M:%S GMT %Y", tz = "GMT"
)

# .keep_all = TRUE retains the other columns; without it current dplyr versions
# return only the WorkerId column.
d.unique.subs <- distinct(d.raw[order(submit_time), ], WorkerId, .keep_all = TRUE)
  1. Getting the means by type and condition
# Average answer for every item type within each anchor condition (missing answers ignored)
d.sum <- summarise(
  group_by(d.tidy, type, condition),
  cost = mean(cost, na.rm = TRUE)
)

d.sum
## Source: local data frame [9 x 3]
## Groups: type [?]
## 
##     type condition        cost
##    (chr)    (fctr)       (dbl)
## 1    dog      over 1898.300000
## 2    dog   rounded 1884.482414
## 3    dog     under 1906.964286
## 4 plasma      over 4300.333000
## 5 plasma   rounded 4091.655172
## 6 plasma     under 4018.357143
## 7  sushi      over    8.322414
## 8  sushi   rounded    7.955517
## 9  sushi     under    7.742500

I had already included the anchor in my tidy data from the start, so there is no need to add it again here.

  1. Calculating absolute percentage change
# Absolute deviation of each answer from its anchor, expressed as a proportion of the
# anchor, then averaged per item type and condition (missing values ignored)
pcts <- summarise(
  group_by(
    mutate(d.tidy, pct_change = abs(cost - anchor) / anchor),
    type, condition
  ),
  pct_change = mean(pct_change, na.rm = TRUE)
)

pcts
## Source: local data frame [9 x 3]
## Groups: type [?]
## 
##     type condition pct_change
##    (chr)    (fctr)      (dbl)
## 1    dog      over  0.2431021
## 2    dog   rounded  0.2462070
## 3    dog     under  0.2347655
## 4 plasma      over  0.1419926
## 5 plasma   rounded  0.1816690
## 6 plasma     under  0.1943951
## 7  sushi      over  0.1108532
## 8  sushi   rounded  0.1160536
## 9  sushi     under  0.1038773
  1. Calculating z score

Tested two ways to make z scores

# Method 1: standardise the answers within each item type using that type's own mean
# and standard deviation, then average the z-scores per type and condition
z.scores <- d.tidy %>%
  group_by(type) %>%
  mutate(z = (cost - mean(cost, na.rm = TRUE)) / sd(cost, na.rm = TRUE)) %>%
  group_by(type, condition) %>% # replaces the by-type grouping
  summarise(z = mean(z, na.rm = TRUE))

z.scores
## Source: local data frame [9 x 3]
## Groups: type [?]
## 
##     type condition           z
##    (chr)    (fctr)       (dbl)
## 1    dog      over  0.00422195
## 2    dog   rounded -0.02787806
## 3    dog     under  0.02435019
## 4 plasma      over  0.19378470
## 5 plasma   rounded -0.05846743
## 6 plasma     under -0.14707091
## 7  sushi      over  0.53314479
## 8  sushi   rounded -0.09274373
## 9  sushi     under -0.45612967
  # Method 2: same computation, letting scale() do the within-type standardisation
  z.scores1 <- d.tidy %>%
    group_by(type) %>%
    mutate(z = as.vector(scale(cost))) %>%
    group_by(type, condition) %>% # replaces the by-type grouping
    summarise(z = mean(z, na.rm = TRUE))

Plot

# Side-by-side bars of the mean proportional deviation from the anchor,
# one group of bars per item type, coloured by condition
ggplot(
  data = pcts,
  mapping = aes(x = type, y = pct_change, fill = condition)
) +
  geom_bar(stat = "identity", position = "dodge")

# Side-by-side bars of the mean z-score, one group of bars per item type,
# coloured by condition.
# Written with ggplot() + geom_bar() instead of qplot(): qplot()'s `stat` and
# `position` arguments are deprecated since ggplot2 2.0.0, and this form matches
# the percent-change plot above.
ggplot(z.scores, aes(x = type, y = z, fill = condition)) +
  geom_bar(position = position_dodge(), stat = "identity")

The results are not straightforward. The percent-change analysis suggests there is not much difference between the over, rounded, and under anchor conditions. However, the z-score analysis suggests that the effect worked for sushi and plasma. This sheds light on how researcher degrees of freedom can influence what the results look like: both measures appear to capture the same thing, and yet the two analyses yield different results.