This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
###exercise 1
# Exercise 1 (weight): density histogram of `weight` overlaid with a normal
# curve that uses the sample mean and standard deviation.
# NOTE(review): absolute, machine-specific path; consider a project-relative
# path so the document knits on other machines.
df <- read.csv("C:\\Users\\USER\\OneDrive\\İş masası\\ASOIU\\Statistical Analysis for data analitics\\hw.csv")
# View() opens a GUI data viewer, so only call it in an interactive session;
# knitting should not depend on a display being available.
if (interactive()) {
  View(df)
}
# library() fails immediately when the package is missing, whereas require()
# only returns FALSE and lets the script break later.
library(ggplot2)
# Weight
ggplot() +
  # after_stat(density) replaces the deprecated ..density.. notation;
  # bins = 30 is the value stat_bin() was already using, now stated explicitly.
  geom_histogram(data = df, aes(x = weight, y = after_stat(density)),
                 bins = 30, fill = "lightblue", color = "black") +
  # Normal density with the sample mean/sd of weight (missing values ignored).
  # `linewidth` replaces `size`, which is deprecated for lines since
  # ggplot2 3.4.0.
  stat_function(fun = dnorm,
                args = list(mean = mean(df$weight, na.rm = TRUE),
                            sd = sd(df$weight, na.rm = TRUE)),
                color = "red", linewidth = 1)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
# Height: density histogram of `height` overlaid with a normal curve that
# uses the sample mean and standard deviation (missing values ignored).
ggplot() +
  # after_stat(density) replaces the deprecated ..density.. notation;
  # bins = 30 is the value stat_bin() was already using, now stated explicitly.
  geom_histogram(data = df, aes(x = height, y = after_stat(density)),
                 bins = 30, fill = "lightgreen", color = "black") +
  # `linewidth` replaces `size`, which is deprecated for lines since
  # ggplot2 3.4.0.
  stat_function(fun = dnorm,
                args = list(mean = mean(df$height, na.rm = TRUE),
                            sd = sd(df$height, na.rm = TRUE)),
                color = "red", linewidth = 1)
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
##Summary: If the bars follow the red line closely, the data are close to a normal distribution. In this chart, the data look mostly normal, but not perfectly so — there are some ups and downs.
###exercise 2
# Exercise 2: cumulative frequency lines for grade1 and grade2.
# NOTE(review): absolute, machine-specific path; consider a project-relative
# path so the document knits on other machines.
df <- read.csv("C:\\Users\\USER\\OneDrive\\İş masası\\ASOIU\\Statistical Analysis for data analitics\\math.csv")
# View() opens a GUI data viewer, so only call it in an interactive session.
if (interactive()) {
  View(df)
}
# library() fails immediately when a package is missing, whereas require()
# only returns FALSE and lets the script break later.
library(ggplot2)
library(plyr)

# Frequency of each distinct grade1 value, then the running share of all rows.
mydata1 <- count(df, "grade1")
cumul1 <- cumsum(mydata1$freq)
cumperc1 <- cumul1 / nrow(df)
mydata1 <- cbind(mydata1, cumperc1)

# Same computation for grade2.
mydata2 <- count(df, "grade2")
cumul2 <- cumsum(mydata2$freq)
cumperc2 <- cumul2 / nrow(df)
mydata2 <- cbind(mydata2, cumperc2)

# cumperc1/cumperc2 are fractions in [0, 1]; they are scaled by 100 in the
# plot so the axis actually shows percentages, matching its label.
# `linewidth` replaces `size`, which is deprecated for lines since
# ggplot2 3.4.0.
ggplot() +
  geom_line(data = mydata1,
            aes(x = grade1, y = 100 * cumperc1, color = "Grade 1"),
            linewidth = 1.3) +
  geom_line(data = mydata2,
            aes(x = grade2, y = 100 * cumperc2, color = "Grade 2"),
            linewidth = 1.3) +
  scale_color_manual("Legend",
                     values = c("Grade 1" = "red", "Grade 2" = "blue")) +
  labs(x = "Grades", y = "Cumulative Percentage",
       title = "Cumulative Frequency Line Chart")
##Summary: The chart shows cumulative frequency lines for Grade 1 and Grade 2. The red line (Grade 1) rises earlier, while the blue line (Grade 2) starts higher, showing that Grade 2 scores are generally higher than Grade 1.
###exercise 3
#a
# Exercise 3a: counts per education level split by gender, drawn once as
# side-by-side (dodged) columns and once as stacked columns.
# NOTE(review): absolute, machine-specific path; consider a project-relative
# path so the document knits on other machines.
df1 <- read.csv("C:\\Users\\USER\\OneDrive\\İş masası\\ASOIU\\Statistical Analysis for data analitics\\directmail.csv")
# View() opens a GUI data viewer, so only call it in an interactive session.
if (interactive()) {
  View(df1)
}
# library() fails immediately when the package is missing, whereas require()
# only returns FALSE and lets the script break later.
library(ggplot2)

# Side-by-side columns; the two unnamed fill colours are matched to the
# gender values in the order of the fill scale's limits.
ggplot(df1, aes(x = educ, fill = gender)) +
  geom_bar(position = "dodge") +
  labs(x = "Education Level", y = "Count",
       title = "Column Chart by Gender and Education") +
  scale_fill_manual(values = c("pink", "lightblue"))

#stacked
# Same counts with the gender segments stacked (default fill palette).
ggplot(df1, aes(x = educ, fill = gender)) +
  geom_bar(position = "stack") +
  labs(x = "Education Level", y = "Count",
       title = "Stacked Column Chart by Gender and Education")
##Summary: Females are more numerous than males in every education category, with the highest number among post-graduate respondents.
#b
# Exercise 3b: number of records per continent, one coloured column each.
# NOTE(review): absolute, machine-specific path; consider a project-relative
# path so the document knits on other machines.
df2 <- read.csv("C:\\Users\\USER\\OneDrive\\İş masası\\ASOIU\\Statistical Analysis for data analitics\\toyota.csv")
# View() opens a GUI data viewer, so only call it in an interactive session.
if (interactive()) {
  View(df2)
}
# library() fails immediately when the package is missing, whereas require()
# only returns FALSE and lets the script break later.
library(ggplot2)

ggplot(df2, aes(x = continent, fill = continent)) +
  geom_bar() +
  labs(x = "Continent", y = "Count",
       title = "Column Chart of Continent")
##Summary: Asia has the most records, followed by Australia and Africa. Europe has the fewest, while North America and South America are in the middle range. Overall, the chart highlights differences in the number of entries across continents.
###exercise 4
# Exercise 4: scatterplot of grade1 vs grade2, then the same plot with the
# fitted least-squares line drawn between the smallest and largest grade1.
# NOTE(review): absolute, machine-specific path; consider a project-relative
# path so the document knits on other machines.
df <- read.csv("C:\\Users\\USER\\OneDrive\\İş masası\\ASOIU\\Statistical Analysis for data analitics\\math.csv")
# View() opens a GUI data viewer, so only call it in an interactive session.
if (interactive()) {
  View(df)
}
# library() fails immediately when the package is missing, whereas require()
# only returns FALSE and lets the script break later.
library(ggplot2)

# Plain scatterplot.
ggplot() +
  geom_point(data = df, aes(x = grade1, y = grade2)) +
  labs(title = "Scatterplot of Grade1 vs Grade2",
       x = "Grade 1", y = "Grade 2")

# Simple linear regression of grade2 on grade1.
model <- lm(grade2 ~ grade1, data = df)

# Endpoints of the trend line. na.rm = TRUE keeps a single missing grade1
# from turning both endpoints (and therefore the whole line) into NA.
min_g1 <- min(df$grade1, na.rm = TRUE)
max_g1 <- max(df$grade1, na.rm = TRUE)
grade1 <- c(min_g1, max_g1)
# The newdata column must be named grade1 to match the model formula.
fit <- predict(model, newdata = data.frame(grade1 = grade1))
endpoints <- data.frame(grade1, fit)

# Scatterplot with the fitted line. `linewidth` replaces `size`, which is
# deprecated for lines since ggplot2 3.4.0.
ggplot() +
  geom_point(data = df, aes(x = grade1, y = grade2)) +
  geom_line(data = endpoints, aes(x = grade1, y = fit),
            color = "red", linewidth = 1) +
  labs(title = "Scatterplot with Trend Line (Grade1 vs Grade2)",
       x = "Grade 1", y = "Grade 2")
##Summary: This plot shows that there is almost no relationship between Grade 1 and Grade 2. The flat red line indicates a very weak or no correlation between the two grades.