# Load the raw responses. stringsAsFactors = TRUE is spelled out because the
# factor-based summaries below rely on it and R >= 4.0 no longer defaults to it.
d <- read.csv("janiszewski_rep_exercise.csv", header = TRUE,
              stringsAsFactors = TRUE)
# Keep only the worker id, submission time, condition, the three anchor prices
# and the three answers.
d <- d[, c("WorkerId", "SubmitTime", "Input.condition",
           "Input.price1", "Input.price2", "Input.price3",
           "Answer.dog_cost", "Answer.plasma_cost", "Answer.sushi_cost")]
#summary statistics
# With WorkerId stored as a factor, summary() prints one count per worker;
# any count above 1 marks a worker who submitted more than once.
summary(d$WorkerId) #repeats
## A10316ZXDCW4TT A14U5M64AK4MAR A16VNOCOSQS8H6 A17FFPDVAJ2SQJ A18L9BY7DNA8D3
## 1 1 1 1 1
## A1A70X1I06UZKH A1DCWHY56LLX11 A1F0K96LA51ICQ A1HQRT1UG6ERGL A1IAC1SAII8YSO
## 1 1 1 1 1
## A1JGZFWWT1KMVV A1OKTCXWVYNXE6 A1OYEERWIFE1E2 A1PYG25XQ6H75Z A1SFABJ4NX5DFY
## 1 1 1 1 1
## A1TQB6QX4RKBJV A1VYX5VKZ0CTWU A1W8FRDIXULFCW A1X4S0LBLQ5ICD A1YGOYNWW3Q9SQ
## 1 3 1 1 1
## A1YQ5UO86XM73Z A1ZLYD5GP1FLYL A21JRS3L6PMJ5Q A21WM0TU34VX0H A228LSM1VQ7GKY
## 1 1 1 1 1
## A26B18YJBK0RF5 A28GTG78A8VHA8 A29JHEW30K9LIU A2AMJWFVBY4TOX A2AVFNOKFHQ2ME
## 1 1 1 1 1
## A2B9QSYOSS5YTM A2BE9ZY7SQJUI1 A2CGAOF4G65D67 A2CJ265G7KDQPQ A2CQ6N923E0AHA
## 1 1 1 1 1
## A2EUAY8BZ2H272 A2FNO46BROC6MW A2FZNTF2DA43I0 A2GREHAB3I8YTW A2HU0HN75JRT9J
## 1 1 1 1 1
## A2JKCS6UCAUARS A2NFI8WZXF9JYD A2P1GEHANE46W1 A2PDWVFT2AVEW9 A2TL18J7K3G2A3
## 1 1 1 1 1
## A2UOKD7ZRDQA0K A2ZJ282IMK4033 A36BYL3PXBRQHR A3800T0V4VADFI A38E9ET5TC41WD
## 1 1 1 1 1
## A3AJLUNBK4EU68 A3CWGT3EMQ17Y0 A3D5C3AR576B1T A3EFN4B29IY00A A3G07E5QWOJTAR
## 1 1 1 1 1
## A3G4OOZ25NZU3I A3HMBHM8HJLKRD A3IM0Z8F01XLE6 A3JQS4QJ5YMWZX A3LBUWPBL3FV59
## 1 1 1 1 1
## A3LQAX287GQ35Q A3NBCP9NSX3670 A3NS3B2BFI6AAO A3O97JQ9J6RHLM A3OWHW7XYQU52K
## 1 1 1 1 1
## A3R1YTYGRQEZU3 A3S99S9NJ6X08J A4X4GQMK34FGB A54HH8I8XLIEP A5IUO2T5MZQUZ
## 1 1 1 1 2
## A9J48W1C9IZAI AAOA3WLON3GKX AB6C1SZWVNQCC ACSZ7IOG3J5NR AEA1MB8LFXEGH
## 1 1 1 1 1
## AFODO5KH3V4EN AG5OJVGC1ZH8Q AHV4U78TUUDKI AI2GXFO5FLLZ1 AKDETA3O440J7
## 1 1 1 1 1
## AMBTRKSCFZU2M AN6NSLCHI8SY4 AR3I3KJIGB4U6 AT2SP7P5DXNUI ATOES4DDGVTGZ
## 1 1 1 1 1
## AW6DOBYV5GVN6 AWR6MOBX9YSOM
## 1 1
# Second and later occurrences of a WorkerId: A1VYX5VKZ0CTWU and A5IUO2T5MZQUZ have duplicates
d$WorkerId[duplicated(d$WorkerId)]
## [1] A1VYX5VKZ0CTWU A5IUO2T5MZQUZ A1VYX5VKZ0CTWU
## 87 Levels: A10316ZXDCW4TT A14U5M64AK4MAR A16VNOCOSQS8H6 ... AWR6MOBX9YSOM
#I am going to keep the earliest submission from each of these workers.
# List every SubmitTime recorded for worker A1VYX5VKZ0CTWU so the later ones can be dropped.
d$SubmitTime[d$WorkerId=="A1VYX5VKZ0CTWU"] #later ones are Thu Jan 26 20:45:26 GMT 2012 Thu Jan 26 20:45:55 GMT 2012
## [1] Thu Jan 26 20:44:20 GMT 2012 Thu Jan 26 20:45:26 GMT 2012
## [3] Thu Jan 26 20:45:55 GMT 2012
## 90 Levels: Fri Jan 27 00:40:50 GMT 2012 ... Wed Jan 25 23:50:08 GMT 2012
# List every SubmitTime recorded for worker A5IUO2T5MZQUZ; note the later one is listed first.
d$SubmitTime[d$WorkerId=="A5IUO2T5MZQUZ"] #later one is Thu Jan 26 23:50:11 GMT 2012
## [1] Thu Jan 26 23:50:11 GMT 2012 Thu Jan 26 23:47:53 GMT 2012
## 90 Levels: Fri Jan 27 00:40:50 GMT 2012 ... Wed Jan 25 23:50:08 GMT 2012
# Drop the later repeat submissions identified above. Rows are matched on the
# worker id together with the exact SubmitTime string, so each repeat worker
# keeps only the earliest row.
d <- d[!(
  (d$WorkerId == "A1VYX5VKZ0CTWU" &
     d$SubmitTime == "Thu Jan 26 20:45:26 GMT 2012") |
    (d$WorkerId == "A1VYX5VKZ0CTWU" &
       d$SubmitTime == "Thu Jan 26 20:45:55 GMT 2012") |
    (d$WorkerId == "A5IUO2T5MZQUZ" &
       d$SubmitTime == "Thu Jan 26 23:50:11 GMT 2012")
), ]
# Participants per anchor condition after de-duplication.
summary(d$Input.condition)
## over rounded under
## 29 29 29
# Inspect the first anchor price: it is summarised as level counts, not as a numeric range.
summary(d$Input.price1)
## 4,988 5,000 5,012
## 29 29 29
# Same counts via table(); the values contain thousands separators.
table(d$Input.price1) #commas
##
## 4,988 5,000 5,012
## 29 29 29
# Inspect the second anchor price: also summarised as level counts, with thousands separators.
summary(d$Input.price2) #commas
## 2,492 2,500 2,508
## 29 29 29
# Counts per distinct value of the second anchor price.
table(d$Input.price2)
##
## 2,492 2,500 2,508
## 29 29 29
# The third anchor price is already numeric, so summary() reports quantiles and the mean.
summary(d$Input.price3)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 8.64 8.64 9.00 9.00 9.36 9.36
# Counts per distinct value of the third anchor price.
table(d$Input.price3)
##
## 8.64 9 9.36
## 29 29 29
#make commas to numeric
# Strip the thousands separators from the two comma-formatted anchor prices and
# convert them to numeric. gsub() handles any comma-formatted value, so a price
# other than the three listed above is converted instead of silently becoming NA.
d$Input.price1 <- as.numeric(
  gsub(",", "", as.character(d$Input.price1), fixed = TRUE)
)
d$Input.price2 <- as.numeric(
  gsub(",", "", as.character(d$Input.price2), fixed = TRUE)
)
# Inspect the raw dog-cost answers before cleaning them.
summary(d$Answer.dog_cost) #written number
## 1000 1500 1600 1658 1687
## 6 6 2 0 0
## 1700 1749 1750 1790 1800
## 2 1 2 1 2
## 1850 1875 1900 1950 1990
## 1 1 4 1 1
## 2,000 2000 2100 2192 2200
## 1 26 2 1 8
## 2250 2292 2299 2300 2325
## 3 1 1 3 1
## 2350 2400 2450 2482 2499.99
## 1 1 2 1 1
## 2500 500 800 five hundred
## 1 1 1 1
# Inspect the raw plasma-cost answers; the output includes a comma-formatted value (4,000) and 0.45.
summary(d$Answer.plasma_cost)
## 0.45 1200 1500 2000 2500 2999 3000 3200 3250
## 1 1 1 1 4 1 3 2 1
## 3258 3458 3499 3500 3800 4,000 4000 4120 4200
## 0 0 1 6 1 1 9 1 2
## 4300 4350 4400 4495 4500 4540 4578 4600 4650
## 2 1 1 1 15 1 1 1 1
## 4675 4699 4700 4750 4800 4849 4850 4888 4899
## 1 1 5 2 4 1 1 2 1
## 4900 4968 4995 4997 4999 4999.99 5000
## 1 1 2 1 2 1 1
# Inspect the raw sushi-cost answers; the output includes a spelled-out value (ehight) and a decimal comma (7,75).
summary(d$Answer.sushi_cost) #written number
## 6.5 6.78 6.95 6.99 7 7,75 7.02 7.35 7.5
## 1 1 0 2 3 3 1 0 1 10
## 7.56 7.58 7.69 7.75 7.8 7.89 7.99 8 8.25 8.46
## 1 1 1 3 2 2 9 18 2 1
## 8.49 8.5 8.69 8.7 8.95 8.99 9 9.3 ehight
## 1 11 1 1 2 4 3 1 1
# Clean the free-text answers and convert them to numeric. Any answer that is
# still not a plain number after these recodes becomes NA in as.numeric().

# Dog cost: one spelled-out answer and one with a thousands separator.
d$Answer.dog_cost <- as.character(d$Answer.dog_cost)
d$Answer.dog_cost[d$Answer.dog_cost == "five hundred"] <- "500"
d$Answer.dog_cost[d$Answer.dog_cost == "2,000"] <- "2000"
d$Answer.dog_cost <- as.numeric(d$Answer.dog_cost)

# Plasma cost: one answer with a thousands separator and one tiny decimal.
d$Answer.plasma_cost <- as.character(d$Answer.plasma_cost)
d$Answer.plasma_cost[d$Answer.plasma_cost == "4,000"] <- "4000"
# The summary output lists this answer as "0.45", so match that spelling as
# well as ".45"; matching ".45" alone leaves the value unrecoded.
# NOTE(review): recoding it to 4500 is a judgement call about what the
# participant meant - confirm, or treat the answer as missing instead.
d$Answer.plasma_cost[d$Answer.plasma_cost %in% c(".45", "0.45")] <- "4500"
d$Answer.plasma_cost <- as.numeric(d$Answer.plasma_cost)

# Sushi cost: one misspelled word and one decimal comma.
d$Answer.sushi_cost <- as.character(d$Answer.sushi_cost)
d$Answer.sushi_cost[d$Answer.sushi_cost == "ehight"] <- "8"
d$Answer.sushi_cost[d$Answer.sushi_cost == "7,75"] <- "7.75"
d$Answer.sushi_cost <- as.numeric(d$Answer.sushi_cost)
# Read the cleaned data and reshape it to one row per worker and item type,
# with the anchor price and the participant's estimate in separate columns.
d1 <- read.csv("janiszewski_rep_cleaned.csv", header = TRUE)
d.tidy <- d1 %>%
  # keep the analysis variables and give them short names in a single step
  select(
    workerid = WorkerId,
    condition = Input.condition,
    plasma_anchor = Input.price1,
    dog_anchor = Input.price2,
    sushi_anchor = Input.price3,
    dog_cost = Answer.dog_cost,
    plasma_cost = Answer.plasma_cost,
    sushi_cost = Answer.sushi_cost
  ) %>%
  # stack the six value columns: `type` holds the old column name, `cost` its value
  gather(
    key = type, value = cost,
    dog_anchor, plasma_anchor, sushi_anchor, dog_cost, plasma_cost, sushi_cost
  ) %>%
  # split e.g. "dog_anchor" into type = "dog" and anchor = "anchor"
  separate(type, into = c("type", "anchor")) %>%
  # spread back out so each row has an `anchor` column and a `cost` column
  spread(key = anchor, value = cost)
# Histogram of the estimates, one panel per item type with free scales.
# The plot is stored in its own variable so that it does not overwrite the
# data frame `d`.
cost_hist <- ggplot(d.tidy, aes(cost)) +
  geom_histogram() +
  facet_wrap(~type, scales = "free")
cost_hist
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
The histogram tells me that while sushi looks mostly normally distributed, plasma looks right skewed and so does dog, to a lesser extent.
I already removed the duplicates above, but I will do it again here using dplyr's `distinct()` function.
# De-duplicate the raw file with dplyr::distinct() instead of by hand.
# NOTE(review): this path ("../data/") differs from the one used for the same
# file at the top of the script - confirm which location is correct.
d.raw <- read.csv("../data/janiszewski_rep_exercise.csv")
# distinct() keeps the first row it meets for each WorkerId, so order the rows
# by submission time first to keep each worker's earliest submission.
# Timestamps look like "Thu Jan 26 20:44:20 GMT 2012"; parsing the day and
# month names needs an English locale, otherwise the times parse to NA and the
# original row order is used.
submit_order <- order(as.POSIXct(
  as.character(d.raw$SubmitTime),
  format = "%a %b %d %H:%M:%S GMT %Y", tz = "GMT"
))
# .keep_all = TRUE keeps every column; without it current dplyr returns only
# the WorkerId column.
d.unique.subs <- distinct(d.raw[submit_order, ], WorkerId, .keep_all = TRUE)
# Mean estimate for each item type within each anchor condition (NAs ignored).
d.sum <- summarise(
  group_by(d.tidy, type, condition),
  cost = mean(cost, na.rm = TRUE)
)
d.sum
## Source: local data frame [9 x 3]
## Groups: type [?]
##
## type condition cost
## (chr) (fctr) (dbl)
## 1 dog over 1898.300000
## 2 dog rounded 1884.482414
## 3 dog under 1906.964286
## 4 plasma over 4300.333000
## 5 plasma rounded 4091.655172
## 6 plasma under 4018.357143
## 7 sushi over 8.322414
## 8 sushi rounded 7.955517
## 9 sushi under 7.742500
I already included the anchor when I built the tidy data, so there is no need to add it again.
# Mean absolute deviation of the estimate from its anchor, as a proportion of
# the anchor, for each item type and anchor condition (NAs ignored).
pcts <- d.tidy %>%
  group_by(type, condition) %>%
  summarise(pct_change = mean(abs(cost - anchor) / anchor, na.rm = TRUE))
pcts
## Source: local data frame [9 x 3]
## Groups: type [?]
##
## type condition pct_change
## (chr) (fctr) (dbl)
## 1 dog over 0.2431021
## 2 dog rounded 0.2462070
## 3 dog under 0.2347655
## 4 plasma over 0.1419926
## 5 plasma rounded 0.1816690
## 6 plasma under 0.1943951
## 7 sushi over 0.1108532
## 8 sushi rounded 0.1160536
## 9 sushi under 0.1038773
I tested two ways of computing z-scores: by hand from each item type's mean and standard deviation, and with `scale()`.
# Standardise each estimate against the mean and SD of its own item type,
# then average the z-scores within each type and anchor condition.
z.scores <- d.tidy %>%
  group_by(type) %>%
  mutate(z = (cost - mean(cost, na.rm = TRUE)) / sd(cost, na.rm = TRUE)) %>%
  group_by(type, condition) %>%
  summarise(z = mean(z, na.rm = TRUE))
z.scores
## Source: local data frame [9 x 3]
## Groups: type [?]
##
## type condition z
## (chr) (fctr) (dbl)
## 1 dog over 0.00422195
## 2 dog rounded -0.02787806
## 3 dog under 0.02435019
## 4 plasma over 0.19378470
## 5 plasma rounded -0.05846743
## 6 plasma under -0.14707091
## 7 sushi over 0.53314479
## 8 sushi rounded -0.09274373
## 9 sushi under -0.45612967
# Same per-type standardisation, this time using scale(); as.vector() drops
# the one-column matrix that scale() returns.
z.scores1 <- d.tidy %>%
  group_by(type) %>%
  mutate(z = as.vector(scale(cost))) %>%
  group_by(type, condition) %>%
  summarise(z = mean(z, na.rm = TRUE))
Plot
# Mean proportional deviation from the anchor by item type, with dodged bars
# for the three anchor conditions.
ggplot(data = pcts, mapping = aes(type, pct_change, fill = condition)) +
  geom_bar(stat = "identity", position = "dodge")
# Mean z-score by item type, with dodged bars for the three anchor conditions.
# Written with ggplot() because qplot() is deprecated and its `stat` and
# `position` arguments are no longer honoured by current ggplot2.
ggplot(z.scores, aes(x = type, y = z, fill = condition)) +
  geom_bar(position = position_dodge(), stat = "identity")
The results are not straightforward. The percent-change analysis suggests that there is not much difference between the over, rounded, and under anchor conditions. However, the z-score analysis suggests that the effect worked for sushi and plasma. This sheds light on how researcher degrees of freedom can influence what the results look like: both measures seem to capture the same thing, and yet the analyses yield different results.