The R code in this example was generated by ChatGPT using the
following prompt:
- I have a dataset that includes the following variable names: ______.
Please refer to the dataset above and write some R syntax that allows me
to analyze the relationship between ______________ and ________ (in my
case, I used Percentage_markup and Profit from the dataset).
# Load required packages
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Read the beer sales CSV into a tibble (point `file` at your own CSV if it
# is named differently)
beer_data <- read_csv(file = "beer_data.csv")
## Rows: 99 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): Date_and_time_of_unloading
## dbl (7): Product_code, Amount, Sale_amount, Discount_amount, Profit, Percent...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# View the first few rows of the dataset to sanity-check the import
head(beer_data)
## # A tibble: 6 × 8
## Date_and_time_of_unlo…¹ Product_code Amount Sale_amount Discount_amount Profit
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1/1/2020 23:00 144 1 280 NA 155
## 2 1/1/2020 23:00 209 2 546. 294. 75.7
## 3 1/1/2020 23:00 213 2 1265. 35.0 653.
## 4 1/1/2020 23:00 217 1 630 70 220.
## 5 1/1/2020 23:00 222 2 1105. 195. 394.
## 6 1/1/2020 23:00 243 1 292. 158. 1.36
## # ℹ abbreviated name: ¹​Date_and_time_of_unloading
## # ℹ 2 more variables: Percentage_markup <dbl>, Discount_percentage <dbl>
# Check the structure of the dataset (column names, types, leading values).
# Per the output, Date_and_time_of_unloading was read as character rather
# than as a date-time; the remaining seven columns are numeric.
str(beer_data)
## spc_tbl_ [99 × 8] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Date_and_time_of_unloading: chr [1:99] "1/1/2020 23:00" "1/1/2020 23:00" "1/1/2020 23:00" "1/1/2020 23:00" ...
## $ Product_code : num [1:99] 144 209 213 217 222 243 258 270 271 282 ...
## $ Amount : num [1:99] 1 2 2 1 2 1 1 2 1 1 ...
## $ Sale_amount : num [1:99] 280 546 1265 630 1105 ...
## $ Discount_amount : num [1:99] NA 294 35 70 195 ...
## $ Profit : num [1:99] 155 75.7 653 220.5 393.8 ...
## $ Percentage_markup : num [1:99] 124 16.1 106.7 53.9 55.4 ...
## $ Discount_percentage : num [1:99] NA 35.03 2.69 10 15.02 ...
## - attr(*, "spec")=
## .. cols(
## .. Date_and_time_of_unloading = col_character(),
## .. Product_code = col_double(),
## .. Amount = col_double(),
## .. Sale_amount = col_double(),
## .. Discount_amount = col_double(),
## .. Profit = col_double(),
## .. Percentage_markup = col_double(),
## .. Discount_percentage = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
# Count missing values in each of the two variables of interest
sum(is.na(beer_data[["Percentage_markup"]]))
## [1] 2
sum(is.na(beer_data[["Profit"]]))
## [1] 0
# Distribution summary (min, quartiles, median, mean, max) for each variable
summary(beer_data[["Percentage_markup"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -13.91 40.00 68.58 70.69 100.00 182.00 2
summary(beer_data[["Profit"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -160.00 66.29 134.00 185.80 240.72 929.28
# Scatter plot of Profit (y-axis) against Percentage Markup (x-axis)
ggplot(data = beer_data, mapping = aes(x = Percentage_markup, y = Profit)) +
  geom_point() +
  labs(
    title = "Scatter Plot of Percentage Markup vs. Profit",
    x = "Percentage Markup",
    y = "Profit"
  )
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

# Calculate the Pearson correlation between Percentage Markup and Profit.
# Percentage_markup contains 2 missing values, and cor() returns NA unless
# told how to handle them, so restrict the calculation to complete pairs.
correlation <- cor(
  beer_data$Percentage_markup,
  beer_data$Profit,
  use = "complete.obs"
)
# Round for display; two decimals is plenty for a descriptive statistic
print(paste("Correlation coefficient:", round(correlation, 2)))
## [1] "Correlation coefficient: 0.49"
# Linear regression model to predict Profit based on Percentage Markup.
# Rows with a missing Percentage_markup are dropped by lm() (the summary
# reports "2 observations deleted due to missingness").
lm_model <- lm(Profit ~ Percentage_markup, data = beer_data)
# Print the residual summary, coefficient table, R-squared and overall F-test
summary(lm_model)
##
## Call:
## lm(formula = Profit ~ Percentage_markup, data = beer_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -234.42 -96.64 -31.68 26.09 646.55
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 27.2702 32.9231 0.828 0.41
## Percentage_markup 2.2266 0.4047 5.502 3.17e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 160.4 on 95 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.2416, Adjusted R-squared: 0.2337
## F-statistic: 30.27 on 1 and 95 DF, p-value: 3.171e-07
# Same scatter plot with the fitted linear-model line overlaid in blue
# (se = FALSE suppresses the confidence band)
ggplot(data = beer_data, mapping = aes(x = Percentage_markup, y = Profit)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "blue") +
  labs(
    title = "Linear Regression: Percentage Markup vs. Profit",
    x = "Percentage Markup",
    y = "Profit"
  )
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 2 rows containing non-finite outside the scale range (`stat_smooth()`).
## Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).
