#Dataset detail: The ramen dataset was collected from Kaggle. Each record in the dataset is a single ramen. The dataset has 6 variables: review number, brand, variety, style, country, and stars.
#Sample detail: There are 7 styles of ramen involved in the survey. Brands from 38 countries and regions are analysed.
# Load the ramen reviews (comma-separated, first row holds the column names)
# and report how many records were read.
ramen <- read.csv("ramen.csv", header = TRUE, sep = ",")
nrow(ramen)
## [1] 2580
# Number of reviews per packaging style.
table(ramen[["Style"]])
##
## Bar Bowl Box Can Cup Pack Tray
## 2 1 481 6 1 450 1531 108
# Number of reviews per country/region of origin.
# NOTE(review): the printed counts list both "United States" and "USA" as
# separate categories; presumably the same country, confirm before merging.
table(ramen[["Country"]])
##
## Australia Bangladesh Brazil Cambodia Canada
## 22 7 5 5 41
## China Colombia Dubai Estonia Fiji
## 169 6 3 2 4
## Finland Germany Ghana Holland Hong Kong
## 3 27 2 4 137
## Hungary India Indonesia Japan Malaysia
## 9 31 126 352 156
## Mexico Myanmar Nepal Netherlands Nigeria
## 25 14 14 15 1
## Pakistan Philippines Poland Sarawak Singapore
## 9 47 4 3 109
## South Korea Sweden Taiwan Thailand UK
## 309 3 224 191 69
## United States USA Vietnam
## 1 323 108
As shown in the pie chart, Pack is the most popular ramen style, followed by Bowl, Cup, and Tray.
# Pie chart of the share of each ramen style.
# The palette is sized from the table itself so every slice gets its own
# colour; a fixed rainbow(6) is recycled when there are more than 6 categories.
style_counts <- table(ramen$Style)
pie(style_counts, main = "Ramen Styles", col = rainbow(length(style_counts)))
Most of the reviewed ramen comes from Japan, followed by the USA and South Korea.
# Bar chart of review counts per country/region.
# las = 2 draws the axis labels perpendicular to the axis so the long
# country names do not overlap.
barplot(
  table(ramen[["Country"]]),
  main = "Countries and Regions",
  ylab = "frequency",
  ylim = c(0, 360),
  las = 2,
  col = "blue"
)
#Ramen stars analysis: There are 2580 ramen reviews in total. The average rating is 3.67 stars. The lowest rating is 1 star, while the highest is 5 stars. The distribution of stars is skewed to the left, as indicated by the mean (3.67) being smaller than the median (3.75). Most reviewers (1994 of 2580) gave more than 3 stars to ramen brands.
# Five-number summary plus mean of the star ratings.
summary(ramen[["Stars"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 3.250 3.750 3.673 4.250 5.000
# Count reviews rated above 3 stars (TRUE) versus 3 stars or fewer (FALSE).
table(ramen[["Stars"]] > 3)
##
## FALSE TRUE
## 586 1994
# Slice labels, given in the FALSE/TRUE order in which table() reports the
# logical split below.
labels <- c("Stars no more than 3", "Stars more than 3 ")

# Share of reviews above / not above 3 stars.
pie(
  table(ramen[["Stars"]] > 3),
  labels = labels,
  main = "Ramen stars",
  col = rainbow(2)
)

# Spread of the star ratings: box plot, then histogram.
boxplot(ramen[["Stars"]], horizontal = TRUE)
hist(
  ramen[["Stars"]],
  main = "Ramen stars",
  xlab = "Stars",
  ylim = c(0, 800),
  col = "red"
)
All four major styles enjoy similar ratings. Overall, Japanese ramen brands are rated higher than other brands.
# 2 x 2 grid of horizontal box plots, one per major style, each with its
# own fill colour. Styles and colours are paired by position.
par(mfrow = c(2, 2))
invisible(Map(
  function(style, fill) {
    boxplot(
      ramen$Stars[ramen$Style == style],
      horizontal = TRUE,
      main = style,
      col = fill
    )
  },
  c("Pack", "Bowl", "Cup", "Tray"),
  c("red", "blue", "yellow", "green")
))
As the sample size increases, the spread of the sample means becomes narrower. The means of the sampling distributions are approximately equal for all sample sizes. As the sample size increases, the distribution of the sample means becomes less skewed, slowly approaching the shape of a normal distribution.
# Sampling distribution of the mean star rating for three sample sizes.
# For each size, draw `samples` samples with replacement from the ratings,
# store each sample mean in `xbar`, and plot the histogram of those means.
# All three panels share the same x and y limits so that the narrowing of
# the spread with growing sample size can be compared directly.
par(mfrow = c(1, 3))
samples <- 2580
for (sample.size in c(10, 50, 100)) {
  xbar <- vapply(
    seq_len(samples),
    function(i) mean(sample(ramen$Stars, size = sample.size, replace = TRUE)),
    numeric(1)
  )
  hist(
    xbar,
    col = "red",
    xlim = c(0, 5),
    ylim = c(0, 800),
    main = paste0("sample size =", sample.size)
  )
}
Simple random sampling (without replacement), systematic sampling, and systematic sampling with unequal probabilities are performed. The chance of a given country's brands being selected changes with the sampling method. If the sample drawn by systematic sampling with unequal probabilities is used instead of the whole dataset, the average ramen rating will be higher, because the inclusion probabilities are proportional to the star rating.
library(sampling)
## Warning: package 'sampling' was built under R version 4.0.3
# Simple random sampling without replacement: srswor() returns a 0/1
# indicator per row, and the rows flagged non-zero form the sample R1.
sample.size <- 50
n <- nrow(ramen)
s <- srswor(sample.size, n)
R1 <- ramen[s != 0, ]
# Country frequencies of the sample (not of the full dataset), matching how
# s2 and s3 are built for the other sampling methods.
s1 <- as.data.frame(table(R1$Country))
prop.table(table(R1$Country))
##
## Cambodia China Finland Germany Hong Kong India
## 0.02 0.10 0.02 0.02 0.08 0.04
## Indonesia Japan Malaysia Mexico Nepal Netherlands
## 0.04 0.10 0.02 0.02 0.02 0.04
## Singapore South Korea Taiwan Thailand USA Vietnam
## 0.02 0.12 0.06 0.08 0.14 0.06
# Systematic sampling: pick a random start r in 1..k and then take every
# k-th row, where k is the sampling interval.
N <- nrow(ramen)
n <- sample.size
k <- ceiling(N / n)
r <- sample(k, 1)
s <- seq(r, by = k, length.out = n)
# Because k is rounded up, a late start can push the last index past N.
# Indexing with it would add an all-NA row to R2 and turn mean(R2$Stars)
# into NA, so out-of-range indices are dropped.
s <- s[s <= N]
R2 <- ramen[s, ]
s2 <- as.data.frame(table(R2$Country))
prop.table(table(R2$Country))
##
## Canada China Germany Hong Kong India Indonesia
## 0.02 0.06 0.02 0.02 0.06 0.02
## Japan Malaysia Philippines Singapore South Korea Taiwan
## 0.12 0.06 0.02 0.04 0.20 0.10
## Thailand UK USA Vietnam
## 0.12 0.04 0.06 0.04
# Systematic sampling with unequal probabilities: inclusion probabilities
# are derived from the star ratings, then UPsystematic() returns a 0/1
# selection indicator per row.
# The sample size comes from sample.size (set for the sampling section)
# rather than a repeated literal, so all three methods stay comparable.
pik <- inclusionprobabilities(ramen$Stars, sample.size)
s <- UPsystematic(pik)
R3 <- ramen[s != 0, ]
s3 <- as.data.frame(table(R3$Country))
prop.table(table(R3$Country))
##
## Australia Canada China Germany Hong Kong Hungary
## 0.02 0.02 0.04 0.02 0.04 0.02
## India Indonesia Japan Malaysia Singapore South Korea
## 0.02 0.10 0.10 0.06 0.04 0.10
## Taiwan Thailand UK USA Vietnam
## 0.06 0.16 0.02 0.12 0.06
# Compare the mean star rating of the full dataset with the means of the
# three samples (R1: simple random, R2: systematic, R3: unequal
# probabilities) in a single bar chart.
par(mfrow = c(1, 1))
a <- c(mean(ramen$Stars), mean(R1$Stars), mean(R2$Stars), mean(R3$Stars))
b <- c("Ramen sample", "Single random", "Systematic", "Unequal probabilities")
# NOTE(review): this data frame is named `c`, which masks base::c as a
# variable; the name is kept in case later code refers to it.
c <- data.frame(a, b)
# names.arg is spelled out in full instead of relying on partial matching.
bp <- barplot(
  c$a,
  ylim = c(0, 5),
  col = "red",
  main = "Average of Ramen Stars",
  ylab = "stars",
  names.arg = b
)
# Print each mean (2 decimals) just above the baseline of its bar.
text(bp, 0, round(c$a, 2), cex = 2, pos = 3)