# Nguồn dữ liệu có thể tải về trên website https://giangtranvn.com/course/ung-dung-r-trong-mo-hinh-thong-ke/?tab=tab-overview
# Ta có thể tải tệp về máy để khai thác, hoặc đọc dữ liệu trực tiếp từ đường dẫn như bên dưới.
library(dplyr)
library(tidyr)
library(ggplot2)
library(readxl)
# Install e1071 only when it is missing, so re-running the script does not
# reinstall the package on every run.
if (!requireNamespace("e1071", quietly = TRUE)) {
  install.packages("e1071")
}
library(e1071)

# Author's local project folder; relative paths used later in the script
# (e.g. "creditdata.xlsx") are resolved against it. Switch directory only when
# the folder exists, so the script does not abort on other machines.
project_dir <- "E:/R/1.LAP TONG HOP R/R_Lab"
if (dir.exists(project_dir)) {
  setwd(project_dir)
}
library(readxl)
# Download the Google Sheets export into a temporary .xlsx file and read it
# into `creditdata`. The temporary file is removed even if the download or the
# read fails.
data_url <- "https://docs.google.com/spreadsheets/d/1His9ifK-VaE3bukaqQ0axuinfCfW4CpS/export?format=xlsx&gid=1159255179"
temp_file <- tempfile(fileext = ".xlsx")
creditdata <- tryCatch(
  {
    # mode = "wb": transfer as binary so the .xlsx archive is not corrupted.
    download.file(data_url, destfile = temp_file, mode = "wb")
    read_excel(temp_file)
  },
  finally = unlink(temp_file)
)
head(creditdata)
# Alternative to the download: read a copy of the workbook from the working
# directory. Skipped (with a message) when the file is not there, so the rest
# of the script can still run on the downloaded data.
if (file.exists("creditdata.xlsx")) {
  credit_data <- read_excel("creditdata.xlsx", sheet = 1)
} else {
  message("creditdata.xlsx not found in ", getwd(), "; skipping local read.")
}
# Show any warnings raised by the previous top-level call.
warnings()

library(readr)
# CSV variant: treat "NA", "N/A" and empty cells as missing values.
if (file.exists("file.csv")) {
  df <- read_csv("file.csv", na = c("NA", "N/A", ""))
}
# Replace every missing value in the data set with 0.
# NOTE(review): after this step no NA remains, so the is.na()-based median
# imputation of Annual_Income / Credit_Score / Years_of_Credit_History further
# down never changes anything -- confirm that zero-filling is intended.
creditdata[is.na(creditdata)] <- 0

# Inspect the structure and the first rows of the data.
str(creditdata)
glimpse(creditdata)
head(creditdata)

# The spreadsheet viewer is only available in interactive sessions.
if (interactive()) {
  View(creditdata)
}

# Replace blanks in the column names with "_" so the columns are easier to
# work with.
names(creditdata) <- gsub(" ", "_", names(creditdata), fixed = TRUE)
# Show the first 6 rows of the two columns of interest.
creditdata %>%
  select(Annual_Income, Monthly_Debt) %>%
  head(6)
# Scatterplot of credit score against annual income with a fitted straight
# line (no confidence band).
ggplot(creditdata, aes(Annual_Income, Credit_Score)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "blue") +
  labs(
    title = "Scatterplot: Annual Income vs. Credit Score",
    x = "Annual Income (USD)",
    y = "Credit Score"
  ) +
  theme(
    plot.title = element_text(family = "Times New Roman", size = 20, hjust = 0.5),
    axis.title = element_text(family = "Times New Roman", size = 18)
  )

# Same plot for the current credit balance.
ggplot(creditdata, aes(Annual_Income, Current_Credit_Balance)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "blue") +
  labs(
    title = "Scatterplot: Annual Income vs. Current_Credit_Balance",
    x = "Annual Income (USD)",
    y = "Current_Credit_Balance"
  ) +
  theme(
    plot.title = element_text(family = "Arial", size = 20, hjust = 0.5),
    axis.title = element_text(family = "Arial", size = 18)
  )
# Simple linear regression of monthly debt on annual income. The fit is stored
# in `model` so that summary(model) below has an object to work on; printing
# it keeps the coefficient output the bare lm() call used to produce.
model <- lm(Monthly_Debt ~ Annual_Income, data = creditdata)
model

# Treat home ownership as a categorical variable and compare the distribution
# of monthly debt across its levels.
creditdata$Home_Ownership <- as.factor(creditdata$Home_Ownership)
ggplot(creditdata, aes(x = Home_Ownership, y = Monthly_Debt)) +
  geom_boxplot(fill = "#0072ce", width = 0.4) +
  labs(
    title = "Monthly Debt by Home Ownership",
    x = "Hình thức sở hữu nhà",
    y = "Chi phí trả nợ hàng tháng (USD)"
  ) +
  theme(
    axis.text.x = element_text(family = "Arial", size = 14),
    axis.text.y = element_text(family = "Arial", size = 14, angle = 0),
    plot.title = element_text(family = "Arial", size = 24, hjust = 0.5),
    axis.title = element_text(family = "Arial", size = 18)
  )

# Regression with the categorical predictor added, then both model summaries.
model_cat <- lm(Monthly_Debt ~ Annual_Income + Home_Ownership, data = creditdata)
summary(model_cat)
summary(model)
# Fill missing values of the three revenue drivers with the column median.
for (col in c("Annual_Income", "Credit_Score", "Years_of_Credit_History")) {
  values <- creditdata[[col]]
  values[is.na(values)] <- median(values, na.rm = TRUE)
  creditdata[[col]] <- values
}

# Simulated revenue: a linear combination of the three drivers plus normal
# noise (sd = 500), rounded to 2 decimals.
# NOTE: rnorm() makes the result differ between runs; call set.seed() first if
# reproducible numbers are needed.
creditdata$Revenue <- round(
  0.0005 * creditdata$Annual_Income +
    2 * creditdata$Credit_Score +
    10 * creditdata$Years_of_Credit_History +
    rnorm(nrow(creditdata), mean = 0, sd = 500),
  2
)

# Revenue cannot be negative: clamp at 0.
creditdata$Revenue <- pmax(creditdata$Revenue, 0)

head(creditdata[, c("Annual_Income", "Credit_Score", "Years_of_Credit_History", "Revenue")])
# Simulated card fee, rounded to 2 decimals.
# NOTE(review): the bankruptcy surcharge applies only when Bankruptcies is
# exactly 1; confirm whether customers with more than one should also pay it.
creditdata$Card_Fee <- round(
  0.5 * creditdata$Number_of_Open_Accounts +
    ifelse(creditdata$Annual_Income > 100000, 25, 10) + # package fee by income
    ifelse(creditdata$Credit_Score < 650, 15, 0) +      # risk surcharge
    ifelse(creditdata$Bankruptcies == 1, 20, 0) +       # bankruptcy surcharge
    rnorm(nrow(creditdata), 0, 3),                      # random noise
  2
)

head(creditdata[, c("Number_of_Open_Accounts", "Annual_Income", "Credit_Score", "Bankruptcies", "Card_Fee")])
# Revenue explained by the card fee alone.
model_revenue <- lm(
  formula = Revenue ~ Card_Fee,
  data = creditdata
)
summary(model_revenue)

# Revenue explained by the card fee together with income, credit score,
# bankruptcies and home ownership.
model_multiple_revenue <- lm(
  formula = Revenue ~ Card_Fee + Annual_Income + Credit_Score +
    Bankruptcies + Home_Ownership,
  data = creditdata
)
summary(model_multiple_revenue)
library(tibble)
library(dplyr)

# predict() requires every variable in `newdata` to have the same type it had
# when the model was fitted. Bankruptcies is taken from creditdata as-is by the
# model, so supply plain numbers when that column is numeric and a factor
# otherwise.
bankruptcy_values <- if (is.numeric(creditdata$Bankruptcies)) {
  0:1
} else {
  factor(0:1)
}

# Every combination of the predictor values to predict for.
explanatory_data <- expand_grid(
  Card_Fee = seq(20, 120, by = 20),
  Annual_Income = seq(30000, 120000, by = 30000),
  Credit_Score = c(600, 700, 800),
  Bankruptcies = bankruptcy_values,
  Home_Ownership = factor(c("Rent", "Own Home", "Home Mortgage"))
)

# Add the model's predicted revenue for each combination.
prediction_data <- explanatory_data %>%
  mutate(
    Predicted_Revenue = predict(model_multiple_revenue, newdata = explanatory_data)
  )

head(prediction_data)
library(viridis)
# Observed customers (semi-transparent points coloured by Revenue) overlaid
# with the predictions from prediction_data.
ggplot(creditdata, aes(x = Card_Fee, y = Annual_Income, color = Revenue)) +
  geom_point(alpha = 0.3) +
  scale_color_viridis_c(option = "viridis") +
  # Predicted points from prediction_data, sized by Credit_Score.
  geom_point(
    data = prediction_data,
    aes(
      x = Card_Fee,
      y = Annual_Income,
      color = Predicted_Revenue,
      size = Credit_Score
    ),
    shape = 15
  ) +
  scale_size_continuous(name = "Credit Score") +
  labs(
    title = "Dự báo Revenue theo Card_Fee, Annual_Income và Credit_Score",
    x = "Card Fee (USD)",
    y = "Annual Income (USD)",
    color = "Dự báo Revenue"
  ) +
  theme(
    axis.text.x = element_text(family = "Arial", size = 14),
    axis.text.y = element_text(family = "Arial", size = 14, angle = 0),
    plot.title = element_text(family = "Arial", size = 18, hjust = 0.5),
    axis.title = element_text(family = "Arial", size = 18)
  )
library(plotly)

# Interactive 3D scatter of the predictions: card fee and annual income on the
# horizontal axes, predicted revenue on the vertical axis and as colour,
# marker size by credit score.
plot_ly(
  prediction_data,
  x = ~Card_Fee,
  y = ~Annual_Income,
  z = ~Predicted_Revenue,
  color = ~Predicted_Revenue,
  size = ~Credit_Score,
  type = "scatter3d",
  mode = "markers"
) %>%
  layout(
    title = "Dự báo Revenue theo Card_Fee, Annual_Income và Credit_Score",
    scene = list(
      xaxis = list(title = "Card_Fee"),
      yaxis = list(title = "Annual_Income"),
      zaxis = list(title = "Predicted_Revenue")
    )
  )