CS544 Final Project: Analysis of Ramen Data

#Dataset detail: The ramen dataset was collected from Kaggle. Each record in the dataset is a single ramen review. The dataset has 6 variables: review number, brand, variety, style, country and stars.

#Sample detail: There are 7 styles of ramen involved in the survey. Brands from 38 countries and regions are analysed.

# Load the ramen reviews (comma-separated, first row holds the column names)
# and report how many reviews were read.
ramen <- read.csv("ramen.csv", header = TRUE, sep = ",")
nrow(ramen)

## [1] 2580

# Number of reviews per packaging style.
table(ramen[["Style"]])

## 
##       Bar Bowl  Box  Can  Cup Pack Tray 
##    2    1  481    6    1  450 1531  108

# Number of reviews per country/region.
table(ramen[["Country"]])

## 
##     Australia    Bangladesh        Brazil      Cambodia        Canada 
##            22             7             5             5            41 
##         China      Colombia         Dubai       Estonia          Fiji 
##           169             6             3             2             4 
##       Finland       Germany         Ghana       Holland     Hong Kong 
##             3            27             2             4           137 
##       Hungary         India     Indonesia         Japan      Malaysia 
##             9            31           126           352           156 
##        Mexico       Myanmar         Nepal   Netherlands       Nigeria 
##            25            14            14            15             1 
##      Pakistan   Philippines        Poland       Sarawak     Singapore 
##             9            47             4             3           109 
##   South Korea        Sweden        Taiwan      Thailand            UK 
##           309             3           224           191            69 
## United States           USA       Vietnam 
##             1           323           108

As shown in the pie chart, Pack is the most popular ramen style, followed by Bowl, Cup and Tray.

# Pie chart of the number of reviews per packaging style.
# The palette is sized from the table itself: a fixed rainbow(6) is recycled
# when there are more than six categories, so unrelated slices end up
# sharing a colour.
style_counts <- table(ramen$Style)
pie(style_counts, main = "Ramen Styles", col = rainbow(length(style_counts)))

Most of the reviewed ramen comes from Japan, followed by the USA and South Korea.

# Bar chart of review counts per country/region.
# las = 2 draws the axis labels perpendicular to the axes so the country
# names do not overlap.
barplot(
  table(ramen$Country),
  main = "Countries and Regions",
  ylab = "frequency",
  ylim = c(0, 360),
  col = "blue",
  las = 2
)

#Ramen stars analysis: There are 2580 ramen reviews in total. The average rating is 3.67 stars. The lowest rating is 1 star, while the highest is 5 stars. The distribution of stars is skewed to the left, as indicated by the mean being smaller than the median. Most reviewers gave the ramen more than 3 stars.

# Five-number summary and mean of the star ratings.
summary(ramen[["Stars"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   3.250   3.750   3.673   4.250   5.000

# Count reviews rated above 3 stars (TRUE) versus 3 stars or fewer (FALSE).
table(ramen[["Stars"]] > 3)

## 
## FALSE  TRUE 
##   586  1994

# Pie chart of reviews rated at most 3 stars versus more than 3 stars.
# The label order follows the FALSE/TRUE order of the logical table.
labels <- c("Stars no more than 3", "Stars more than 3 ")
pie(
  table(ramen$Stars > 3),
  labels = labels,
  main = "Ramen stars",
  col = rainbow(2)
)

# Horizontal box plot of all star ratings.
boxplot(ramen$Stars, horizontal = TRUE)

# Histogram of all star ratings.
hist(
  ramen$Stars,
  main = "Ramen stars",
  xlab = "Stars",
  col = "red",
  ylim = c(0, 800)
)

All four major styles enjoy similar ratings. Overall, Japanese ramen brands are rated higher than other brands.

# Draw one horizontal box plot of star ratings for each of the four most
# common styles, arranged in a 2 x 2 grid.
par(mfrow = c(2, 2))
invisible(Map(
  function(style_name, fill) {
    boxplot(
      ramen$Stars[ramen$Style == style_name],
      horizontal = TRUE,
      main = style_name,
      col = fill
    )
  },
  c("Pack", "Bowl", "Cup", "Tray"),
  c("red", "blue", "yellow", "green")
))

As the sample size increases, the spread of the sample means becomes narrower. The means of the sampling distributions are approximately equal for all sample sizes. As the sample size increases, the distribution of the sample means becomes less skewed, slowly approaching the shape of a normal distribution.

# Sampling distribution of the mean star rating for three sample sizes.
# For each size, draw `samples` samples with replacement from the ratings,
# record each sample mean in `xbar`, and plot the histogram of those means.
# All three panels share the same x and y limits so the narrowing spread can
# be compared directly between panels.
par(mfrow = c(1, 3))
samples <- 2580
for (sample.size in c(10, 50, 100)) {
  xbar <- replicate(
    samples,
    mean(sample(ramen$Stars, size = sample.size, replace = TRUE))
  )
  hist(
    xbar,
    col = "red",
    xlim = c(0, 5),
    ylim = c(0, 800),
    main = paste0("sample size =", sample.size)
  )
}

Simple random sampling, systematic sampling, and systematic sampling with unequal probabilities are performed. The chance of a country's brand being selected changes with the sampling method. If samples drawn by systematic sampling with unequal probabilities are used instead of the whole dataset, the average ramen rating will be higher.

library(sampling)

## Warning: package 'sampling' was built under R version 4.0.3

# Simple random sampling without replacement.
# srswor() is indexed here as a selection indicator over the n rows: rows
# whose entry is non-zero form the sample R1.
sample.size <- 50
n <- nrow(ramen)
s <- srswor(sample.size, n)
R1 <- ramen[s != 0, ]
# Country frequencies of the sample (R1), matching how s2 and s3 are built
# from R2 and R3 for the other sampling methods.
s1 <- as.data.frame(table(R1$Country))
prop.table(table(R1$Country))

## 
##    Cambodia       China     Finland     Germany   Hong Kong       India 
##        0.02        0.10        0.02        0.02        0.08        0.04 
##   Indonesia       Japan    Malaysia      Mexico       Nepal Netherlands 
##        0.04        0.10        0.02        0.02        0.02        0.04 
##   Singapore South Korea      Taiwan    Thailand         USA     Vietnam 
##        0.02        0.12        0.06        0.08        0.14        0.06

# Systematic sampling: pick a random start r in 1..k, then take every k-th row.
N <- nrow(ramen)
n <- sample.size
stopifnot(n >= 1, n <= N)
# Use floor(), not ceiling(): the largest index is r + (n - 1) * k <= n * k,
# which stays within 1..N only when k <= N / n. With ceiling() the last index
# can run past N, which puts an all-NA row in R2 and makes mean(R2$Stars) NA.
# The trade-off is that rows after n * k are never selected.
k <- floor(N / n)
r <- sample(k, 1)
s <- seq(r, by = k, length.out = n)
R2 <- ramen[s, ]
s2 <- as.data.frame(table(R2$Country))
prop.table(table(R2$Country))

## 
##      Canada       China     Germany   Hong Kong       India   Indonesia 
##        0.02        0.06        0.02        0.02        0.06        0.02 
##       Japan    Malaysia Philippines   Singapore South Korea      Taiwan 
##        0.12        0.06        0.02        0.04        0.20        0.10 
##    Thailand          UK         USA     Vietnam 
##        0.12        0.04        0.06        0.04

# Systematic sampling with unequal probabilities.
# Inclusion probabilities are derived from the star rating, so higher-rated
# reviews are more likely to be selected. The sample size comes from
# sample.size so it stays in step with the other sampling methods.
pik <- inclusionprobabilities(ramen$Stars, sample.size)
# The result is indexed as a selection indicator: non-zero rows form R3.
s <- UPsystematic(pik)
R3 <- ramen[s != 0, ]
s3 <- as.data.frame(table(R3$Country))
prop.table(table(R3$Country))

## 
##   Australia      Canada       China     Germany   Hong Kong     Hungary 
##        0.02        0.02        0.04        0.02        0.04        0.02 
##       India   Indonesia       Japan    Malaysia   Singapore South Korea 
##        0.02        0.10        0.10        0.06        0.04        0.10 
##      Taiwan    Thailand          UK         USA     Vietnam 
##        0.06        0.16        0.02        0.12        0.06

# Compare the mean star rating of the full dataset with the means of the
# three samples (R1, R2, R3) in a single bar chart.
par(mfrow = c(1, 1))
a <- c(mean(ramen$Stars), mean(R1$Stars), mean(R2$Stars), mean(R3$Stars))
b <- c("Ramen sample", "Single random", "Systematic", "Unequal probabilities")
# names.arg is spelled out rather than relying on partial matching of `name`,
# and the values are plotted directly instead of via a data frame named `c`,
# which masked base::c.
bp <- barplot(
  a,
  ylim = c(0, 5),
  col = "red",
  main = "Average of Ramen Stars",
  ylab = "stars",
  names.arg = b
)
# Print each mean, rounded to two decimals, just above the baseline of its bar.
text(bp, 0, round(a, 2), cex = 2, pos = 3)