#Install Packages
#install.packages("dplyr")
#install.packages("tidyr")
#install.packages("readr")
#install.packages("lubridate")
#install.packages("ggplot2")
#install.packages("tidyverse")
#install.packages("psych")
#install.packages("reshape2")
#install.packages("corrplot")
#install.packages("caret")
#install.packages("scales")
#Libraries
library(tidyverse)
library(psych)
library(ggplot2)
library(reshape2)
library(corrplot)
library(scales)
##Uploading the dataset
rental_income <- read.csv("XYZ LLC 2021 PL.csv")
#EPA
##Head
head(rental_income)
##Structure
str(rental_income)
'data.frame': 28 obs. of 17 variables:
$ X : chr "Ordinary Income/Expense" "Income" "Rental Income" "Unit 1115" ...
$ X1.1.2022 : num NA NA NA 16663 15500 ...
$ X2.1.2022 : num NA NA NA 0 15500 ...
$ X3.1.2022 : num NA NA NA 16663 15500 ...
$ X4.1.2022 : num NA NA NA 16663 15500 ...
$ X5.1.2022 : num NA NA NA 16663 15500 ...
$ X6.1.2022 : num NA NA NA 16663 15500 ...
$ X7.1.2022 : num NA NA NA 16663 15500 ...
$ X8.1.2022 : num NA NA NA 16663 15500 ...
$ X9.1.2022 : num NA NA NA 0 15500 ...
$ X10.1.2022: num NA NA NA 16663 15500 ...
$ X11.1.2022: num NA NA NA 16663 15500 ...
$ X12.1.2022: num NA NA NA 0 15500 ...
$ X1.1.2023 : num NA NA NA 16663 15500 ...
$ X1.2.2023 : num NA NA NA 16663 15500 ...
$ X3.1.2023 : num NA NA NA 16663 15500 ...
$ X4.1.2023 : num NA NA NA 0 15500 ...
#Summary
summary(rental_income)
X X1.1.2022 X2.1.2022 X3.1.2022
Length:28 Min. : 63.78 Min. : 0.00 Min. : 0.00
Class :character 1st Qu.: 307.01 1st Qu.: 63.78 1st Qu.: 63.78
Mode :character Median : 8880.50 Median : 307.01 Median : 307.01
Mean :16403.14 Mean :12960.68 Mean :12879.38
3rd Qu.:16805.15 3rd Qu.:16300.00 3rd Qu.:16300.00
Max. :99680.87 Max. :83018.00 Max. :82448.87
NA's :8 NA's :7 NA's :7
X4.1.2022 X5.1.2022 X6.1.2022 X7.1.2022
Min. : 0 Min. : 0.0 Min. : 0.0 Min. : 0.0
1st Qu.: 219 1st Qu.: 138.8 1st Qu.: 138.8 1st Qu.: 138.8
Median : 1500 Median : 3000.0 Median : 3000.0 Median : 3000.0
Mean :13375 Mean :15341.1 Mean :15341.1 Mean :15341.1
3rd Qu.:16663 3rd Qu.:16662.9 3rd Qu.:16662.9 3rd Qu.:16662.9
Max. :84920 Max. :99680.9 Max. :99680.9 Max. :99680.9
NA's :7 NA's :7 NA's :7 NA's :7
X8.1.2022 X9.1.2022 X10.1.2022 X11.1.2022
Min. : 0.00 Min. : 0.00 Min. : 0.0 Min. : 0.0
1st Qu.: 63.78 1st Qu.: 63.78 1st Qu.: 138.8 1st Qu.: 138.8
Median : 307.01 Median : 307.01 Median : 3000.0 Median : 3000.0
Mean :13232.38 Mean :12960.68 Mean :15341.1 Mean :15341.1
3rd Qu.:16662.87 3rd Qu.:16300.00 3rd Qu.:16662.9 3rd Qu.:16662.9
Max. :84919.87 Max. :83018.00 Max. :99680.9 Max. :99680.9
NA's :7 NA's :7 NA's :7 NA's :7
X12.1.2022 X1.1.2023 X1.2.2023 X3.1.2023
Min. : 0.00 Min. : 0.0 Min. : 0.0 Min. : 0.00
1st Qu.: 63.78 1st Qu.: 282.8 1st Qu.: 138.8 1st Qu.: 63.78
Median : 307.01 Median : 2950.0 Median : 3000.0 Median : 307.01
Mean :12960.68 Mean :13293.5 Mean :15341.1 Mean :13232.38
3rd Qu.:16300.00 3rd Qu.:16662.9 3rd Qu.:16662.9 3rd Qu.:16662.87
Max. :83018.00 Max. :83380.9 Max. :99680.9 Max. :84919.87
NA's :7 NA's :7 NA's :7 NA's :7
X4.1.2023
Min. : 0
1st Qu.: 219
Median : 1500
Mean :13104
3rd Qu.:16300
Max. :83018
NA's :7
##Omit NA values
clean_rental_income <- na.omit(rental_income)
#view(clean_rental_income)
#check the current column names
print(colnames(clean_rental_income))
[1] "X" "X1.1.2022" "X2.1.2022" "X3.1.2022" "X4.1.2022" "X5.1.2022"
[7] "X6.1.2022" "X7.1.2022" "X8.1.2022" "X9.1.2022" "X10.1.2022" "X11.1.2022"
[13] "X12.1.2022" "X1.1.2023" "X1.2.2023" "X3.1.2023" "X4.1.2023"
Col_headers <- seq(as.Date("2022-01-01"), as.Date("2023-04-01"), by = "month")
# Assign dates as column names from the second column onwards
names(clean_rental_income)[-1] <- format(Col_headers, "%B %Y")
print(colnames(clean_rental_income))
[1] "X" "January 2022" "February 2022" "March 2022"
[5] "April 2022" "May 2022" "June 2022" "July 2022"
[9] "August 2022" "September 2022" "October 2022" "November 2022"
[13] "December 2022" "January 2023" "February 2023" "March 2023"
[17] "April 2023"
total_rental_income_monthly <- clean_rental_income %>%
filter(grepl("Total Rental Income", clean_rental_income[[1]])) %>% # Adjust the column index if necessary
select(-1) # Remove the category column to focus on numerical data
# Transpose for better readability
monthly_totals <- colSums(total_rental_income_monthly, na.rm = TRUE)
monthly_totals_df <- data.frame(Month = names(monthly_totals), Total = as.numeric(monthly_totals), row.names = NULL)
print(monthly_totals_df)
#Bar Chart with the Total Rental Income per month
ggplot(monthly_totals_df, aes(x = Month, y = Total)) +
geom_bar(stat = "identity", fill = "steelblue") +
theme_minimal() +
labs(title = "Monthly Total Rental Income",
x = "Month",
y = "Total Income ($)") +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) # Rotate x-axis labels for better readability
# Define the specific order of the months
months_order <- c("January 2022", "February 2022", "March 2022", "April 2022",
"May 2022", "June 2022", "July 2022", "August 2022", "September 2022",
"October 2022", "November 2022", "December 2022", "January 2023",
"February 2023", "March 2023", "April 2023")
# Ensure that 'Month' is converted to a factor and ordered correctly
monthly_totals_df$Month <- factor(monthly_totals_df$Month, levels = months_order)
# Plotting the bar chart with the months in the specified order
ggplot(monthly_totals_df, aes(x = Month, y = Total)) +
geom_bar(stat = "identity", fill = "steelblue") +
geom_text(aes(label = paste0("$", format(Total, big.mark=","))), # Format numbers with commas
position = position_stack(vjust = 0.5), # Adjust vertical position to be inside the bars
color = "white", size = 3.5, angle = 90) + # Set text color and size, rotate text vertically
theme_minimal() +
labs(title = "Monthly Total Rental Income",
x = "Month",
y = "Total Income ($)") +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) # Rotate x-axis labels for better readability
# Filter the data for 'Net Income'
net_income_data <- clean_rental_income %>%
filter(grepl("Net Income", clean_rental_income[[1]])) %>% # Adjust the column index if necessary
select(-1) # Remove the category column to focus on numerical data
print(net_income_data)
# Prepare data for plotting
net_income_data <- tibble(
Month = months_order,
Net_Income = as.numeric(net_income_data[1, ])
)
print(net_income_data)
# Convert 'Month' into an ordered factor based on the months_order
net_income_data$Month <- factor(net_income_data$Month, levels = months_order)
# Plotting the line graph
plot <- ggplot(net_income_data, aes(x = Month, y = Net_Income)) +
geom_line(color = "dodgerblue", size = 1.5) + # Ensure line connects the points
geom_point(color = "darkblue", size = 3.5) + # Points on the line
geom_smooth(method = "lm", color = "red", se = FALSE, aes(group = 1)) + # Add a linear trendline
theme_minimal() +
scale_y_continuous(labels = dollar_format(prefix = "$", suffix = ""),
breaks = pretty_breaks(n = 10)) +
labs(title = "Monthly Net Income",
x = "Month",
y = "Net Income ($)") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Print the plot
print(plot)
# Extract column names (months), which are the levels of the 'Month' factor
month_names <- levels(net_income_data$Month) # This assumes 'Month' is a factor
# Extract values for Net Income
total_income <- net_income_data$Net_Income
# Create the line plot with nicer aesthetics but without the y-axis
plot(1:length(month_names), total_income, type = "o",
xlab = "Months", ylab = "", # Remove ylab here
main = "Net Income per Month",
col = "blue", # Line color
pch = 16, # Point shape
lwd = 2, # Line width
ylim = c(0, max(total_income) * 1.1), # Adjust y-axis limits
xaxt = "n", # Disable x-axis labels
yaxt = "n") # Disable y-axis to customize later
# Add month labels on the x-axis with rotated labels
axis(1, at = 1:length(month_names), labels = month_names, las = 2, cex.axis = 0.8)
# Customize y-axis with dollar sign
y_at <- pretty(range(c(0, max(total_income) * 1.1))) # Get pretty breaks for y-axis
axis(2, at = y_at, labels = paste("$", format(y_at, big.mark = ",", scientific = FALSE), sep = ""), las = 1, cex.axis = 0.8)
# Add gridlines
grid(nx = NULL, ny = NULL) # Add horizontal and vertical grid lines
# Add a legend
legend("topright",
legend = c("Net Income"),
col = "blue",
lty = 1,
pch = 16,
cex = 0.8,
bg = "white")
#Analysis of Expense Lines
transposed_data <- t(clean_rental_income)
#view(transposed_data)
colnames(transposed_data) <- transposed_data[1, ]
transposed_data <- transposed_data[-1, ]
#view(transposed_data)
# Select only the rows corresponding to expenses
rental_expenses <- transposed_data[, c("Bright Star Credit Union", "Bank Charges - Other", "Total Commission", "Filing Fees","Total Electricity", "Total Utilities")]
# Convert the selected columns to numeric
rental_expenses <- apply(rental_expenses, 2, as.numeric)
# Calculate the sum of each expense column
total_expenses <- colSums(rental_expenses)
# Create a dataframe with the expense names and their totals
expenses_df <- data.frame(expense = names(total_expenses), total = total_expenses)
##Bar Chart Expenses
# Create the bar plot with enhancements
ggplot(expenses_df, aes(x = expense, y = total)) +
geom_bar(stat = "identity", fill = "skyblue") +
geom_text(aes(label = format(total, big.mark = ",")), # Adding formatted labels inside bars
position = position_stack(vjust = 0.5), # Center labels vertically in bars
color = "black", size = 3.5) + # Set text color and size
labs(title = "Total Expenses", x = "Expense", y = "Total") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1), # Rotate x-axis labels for better readability
axis.text.y = element_text(color = "gray20")) # Enhance y-axis labels readability
##Pie Chart Distribution of Expenses
# Calculate the percentages for the pie chart labels
expenses_df$percentage <- expenses_df$total / sum(expenses_df$total) * 100
# Create the pie chart
ggplot(expenses_df, aes(x = "", y = total, fill = expense)) +
geom_bar(width = 1, stat = "identity") + # Use geom_bar and set width to 1 for pie chart
coord_polar(theta = "y") + # Convert bar chart to pie chart using polar coordinates
geom_text(aes(label = paste0(round(percentage, 1), "%")), position = position_stack(vjust = 0.5),
color = "white", size = 4) + # Add percentage labels inside each slice
labs(title = "Distribution of Expenses", x = NULL, y = NULL, fill = "Expense Type") +
theme_void() + # Remove axis and grid lines
theme(legend.position = "right", # Adjust legend position
legend.title = element_text(size = 10), # Legend title size
legend.text = element_text(size = 8)) # Legend text size
#install.packages("randomForest")
# Fit multiple linear regression model
multiple_lm <- lm(`Net Income` ~ `Total Rental Income` + `Bright Star Credit Union` + `Bank Charges - Other` + `Total Commission` + `Filing Fees` + `Total Electricity` + `Total Utilities` + `Depreciation Expense`, data = model_data)
# Print summary of the model
summary(multiple_lm)
Warning: essentially perfect fit: summary may be unreliable
Call:
lm(formula = `Net Income` ~ `Total Rental Income` + `Bright Star Credit Union` +
`Bank Charges - Other` + `Total Commission` + `Filing Fees` +
`Total Electricity` + `Total Utilities` + `Depreciation Expense`,
data = model_data)
Residuals:
Min 1Q Median 3Q Max
-5.109e-12 -1.523e-12 0.000e+00 5.740e-14 7.002e-12
Coefficients: (6 not defined because of singularities)
Estimate Std. Error t value Pr(>|t|)
(Intercept) 5.961e+04 3.731e-12 1.598e+16 <2e-16 ***
`Total Rental Income`83018.00 5.691e+02 4.193e-12 1.357e+14 <2e-16 ***
`Total Rental Income`83380.87 9.320e+02 6.262e-12 1.488e+14 <2e-16 ***
`Total Rental Income`84919.87 2.471e+03 4.345e-12 5.686e+14 <2e-16 ***
`Total Rental Income`99680.87 1.723e+04 4.017e-12 4.290e+15 <2e-16 ***
`Bright Star Credit Union` NA NA NA NA
`Bank Charges - Other` NA NA NA NA
`Total Commission` -1.000e+00 1.143e-15 -8.746e+14 <2e-16 ***
`Filing Fees` NA NA NA NA
`Total Electricity` NA NA NA NA
`Total Utilities` NA NA NA NA
`Depreciation Expense` NA NA NA NA
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 3.731e-12 on 10 degrees of freedom
Multiple R-squared: 1, Adjusted R-squared: 1
F-statistic: 1.536e+31 on 5 and 10 DF, p-value: < 2.2e-16
##Multiple Linear Regression
# Fit multiple linear regression model
multiple_lm <- lm(`Net Income` ~ ., data = model_data)
# Print summary of the model
summary(multiple_lm)
Warning: essentially perfect fit: summary may be unreliable
Call:
lm(formula = `Net Income` ~ ., data = model_data)
Residuals:
Min 1Q Median 3Q Max
-5.109e-12 -1.523e-12 0.000e+00 5.740e-14 7.002e-12
Coefficients: (6 not defined because of singularities)
Estimate Std. Error t value Pr(>|t|)
(Intercept) 5.961e+04 3.731e-12 1.598e+16 <2e-16 ***
`Total Rental Income`83018.00 5.691e+02 4.193e-12 1.357e+14 <2e-16 ***
`Total Rental Income`83380.87 9.320e+02 6.262e-12 1.488e+14 <2e-16 ***
`Total Rental Income`84919.87 2.471e+03 4.345e-12 5.686e+14 <2e-16 ***
`Total Rental Income`99680.87 1.723e+04 4.017e-12 4.290e+15 <2e-16 ***
`Bright Star Credit Union` NA NA NA NA
`Bank Charges - Other` NA NA NA NA
`Total Commission` -1.000e+00 1.143e-15 -8.746e+14 <2e-16 ***
`Filing Fees` NA NA NA NA
`Total Electricity` NA NA NA NA
`Total Utilities` NA NA NA NA
`Depreciation Expense` NA NA NA NA
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 3.731e-12 on 10 degrees of freedom
Multiple R-squared: 1, Adjusted R-squared: 1
F-statistic: 1.536e+31 on 5 and 10 DF, p-value: < 2.2e-16
# Remove rows with missing values
numeric_data <- na.omit(numeric_data)
# Calculate the correlation matrix
correlation_matrix <- cor(numeric_data)
Warning: the standard deviation is zero
# Print the correlation matrix
#print(correlation_matrix)
#Check for missing values in the relevant columns
missing_values <- sapply(correlation_data, function(x) sum(is.na(x)))
# Print the number of missing values for each column
print(missing_values)
Total Rental Income Bright Star Credit Union Bank Charges - Other
0 0 0
Total Commission Filing Fees Total Electricity
0 0 0
Total Utilities Depreciation Expense Net Income
0 0 0