# Load the pseudo-Facebook user data and inspect its structure.
# `stringsAsFactors = TRUE` is passed explicitly: since R 4.0.0 read.csv()
# no longer converts strings to factors by default, while the rest of this
# analysis (the str() output below, the is.factor column selection) relies on
# `gender` being a factor.
facebook_Df <- read.csv(
  "C:/Users/PC/Documents/R_4DS/Facebook/pseudo_facebook.csv",
  stringsAsFactors = TRUE
)
str(facebook_Df)
## 'data.frame': 99003 obs. of 15 variables:
## $ userid : int 2094382 1192601 2083884 1203168 1733186 1524765 1136133 1680361 1365174 1712567 ...
## $ age : int 14 14 14 14 14 14 13 13 13 13 ...
## $ dob_day : int 19 2 16 25 4 1 14 4 1 2 ...
## $ dob_year : int 1999 1999 1999 1999 1999 1999 2000 2000 2000 2000 ...
## $ dob_month : int 11 11 11 12 12 12 1 1 1 2 ...
## $ gender : Factor w/ 2 levels "female","male": 2 1 2 1 2 2 2 1 2 2 ...
## $ tenure : int 266 6 13 93 82 15 12 0 81 171 ...
## $ friend_count : int 0 0 0 0 0 0 0 0 0 0 ...
## $ friendships_initiated: int 0 0 0 0 0 0 0 0 0 0 ...
## $ likes : int 0 0 0 0 0 0 0 0 0 0 ...
## $ likes_received : int 0 0 0 0 0 0 0 0 0 0 ...
## $ mobile_likes : int 0 0 0 0 0 0 0 0 0 0 ...
## $ mobile_likes_received: int 0 0 0 0 0 0 0 0 0 0 ...
## $ www_likes : int 0 0 0 0 0 0 0 0 0 0 ...
## $ www_likes_received : int 0 0 0 0 0 0 0 0 0 0 ...
# Preview the first six rows (six is head()'s default row count).
head(facebook_Df, n = 6L)
## userid age dob_day dob_year dob_month gender tenure friend_count
## 1 2094382 14 19 1999 11 male 266 0
## 2 1192601 14 2 1999 11 female 6 0
## 3 2083884 14 16 1999 11 male 13 0
## 4 1203168 14 25 1999 12 female 93 0
## 5 1733186 14 4 1999 12 male 82 0
## 6 1524765 14 1 1999 12 male 15 0
## friendships_initiated likes likes_received mobile_likes mobile_likes_received
## 1 0 0 0 0 0
## 2 0 0 0 0 0
## 3 0 0 0 0 0
## 4 0 0 0 0 0
## 5 0 0 0 0 0
## 6 0 0 0 0 0
## www_likes www_likes_received
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
# Names of the columns that contain at least one missing value.
names(which(colSums(is.na(facebook_Df)) > 0))
## [1] "gender" "tenure"
# Number of missing values in every column.
vapply(facebook_Df, function(column) sum(is.na(column)), numeric(1))
## userid age dob_day
## 0 0 0
## dob_year dob_month gender
## 0 0 175
## tenure friend_count friendships_initiated
## 2 0 0
## likes likes_received mobile_likes
## 0 0 0
## mobile_likes_received www_likes www_likes_received
## 0 0 0
## Proportion of Missing Data by Columns
# Print blank lines followed by the section banner.
# cat() joins the pieces with a single space, exactly as in a multi-argument call.
cat("\n\n")
cat(
  c(
    "\n\n",
    "+=============== Proportion of Missing Data in the Dataset =================+",
    "\n\n"
  ),
  sep = " "
)
##
##
## +=============== Proportion of Missing Data in the Dataset =================+
# Share of missing values per column, kept as a proportion in `pctmiss`.
pctmiss <- colSums(is.na(facebook_Df)) / nrow(facebook_Df)
# Display as a percentage with 3 decimals: the missing shares are far below
# 0.5%, so rounding the raw proportion to 2 decimals printed 0 for every column
# and hid the missing values in `gender` (175 rows) and `tenure` (2 rows).
round(100 * pctmiss, 3)
## userid age dob_day
## 0.000 0.000 0.000
## dob_year dob_month gender
## 0.000 0.000 0.177
## tenure friend_count friendships_initiated
## 0.002 0.000 0.000
## likes likes_received mobile_likes
## 0.000 0.000 0.000
## mobile_likes_received www_likes www_likes_received
## 0.000 0.000 0.000
The gender and tenure variables have missing values. Let us start the data-cleaning process, beginning with tenure.
# Row positions of the records whose tenure is missing.
which(is.na(facebook_Df[["tenure"]]))
## [1] 35058 63980
# Kept for reference only: a comparison with `== NA` always yields NA,
# which is why is.na() is used instead.
# facebook_Df[(facebook_Df$tenure == NA),]$tenure
# facebook_Df[10,]
# Show the complete rows that have a missing tenure.
filter(facebook_Df, is.na(tenure))
## userid age dob_day dob_year dob_month gender tenure friend_count
## 1 1414063 88 14 1925 12 female NA 48
## 2 2095829 103 1 1910 1 female NA 137
## friendships_initiated likes likes_received mobile_likes mobile_likes_received
## 1 41 18 5 17 3
## 2 52 110 74 0 21
## www_likes www_likes_received
## 1 1 2
## 2 110 53
One thing common to both rows is that both users are female, so we fill the missing tenure values with the mean tenure of female users.
# Subset of users recorded as female (rows with a missing gender are excluded).
femaleDf <- filter(facebook_Df, gender == "female")
# Mean tenure of female users, ignoring missing tenures (about 587.2292).
femaleDf_mean <- mean(femaleDf[["tenure"]], na.rm = TRUE)
# Impute every missing tenure with that mean.
facebook_Df$tenure[is.na(facebook_Df$tenure)] <- femaleDf_mean
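For the missing gender values, the rows can either be dropped or the missing entries can be reassigned to a third value; let us go with the second option.
Note: if you are a beginner and try the method used for the numeric variable, you will get errors. Notice how a factor variable is handled instead: it is converted to character, the missing values are replaced, and it is converted back to a factor.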
## Work on a character copy of gender: a new value cannot be assigned
## directly into a factor that lacks the matching level
f_dta <- as.character(facebook_Df$gender)
## Label every missing gender as "Unknown"
f_dta[is.na(f_dta)] <- "Unknown"
## Turn the copy back into a factor ("Unknown" becomes a level)
f_dta <- as.factor(f_dta)
## Write the cleaned factor back into the dataset
facebook_Df$gender <- f_dta
## Drop the identifier, date-of-birth and detailed like-count columns
drp <- c(
  "userid", "dob_day", "dob_year", "dob_month",
  "likes_received", "mobile_likes", "mobile_likes_received",
  "www_likes", "www_likes_received"
)
## Keep every column whose name is not listed in `drp`
facebook_Df <- facebook_Df[, setdiff(names(facebook_Df), drp)]
There does not seem to be a definite outcome variable, so we will simply explore relationships and distributions based on the data type: factor/categorical and numeric.
Before we go on, we can look at the distribution of the gender variable. This is not essential in an EDA; however, the original data description is not available for this variable, and this is a way to educate ourselves more about it and to check whether anything does not look normal.
# Keep only the factor columns, then chart how many users fall in each gender
# level, labelling every bar with its rounded percentage of all users.
factorVars <- select_if(facebook_Df, is.factor)
factorVars %>%
  count(gender) %>%
  mutate(pct = n / sum(n)) %>%
  mutate(pctlabel = paste0(round(pct * 100), "%")) %>%
  ggplot(aes(x = gender, y = n)) +
  geom_col() +
  geom_text(aes(label = pctlabel), vjust = -0.5) +
  xlab("Gender") +
  ylab("Frequency") +
  ggtitle("Percentage Distribution of Facebook Users by Gender") +
  theme_minimal()
Even the share of missing gender values is negligible.
## Visual representation of Numerical data using Boxplots and Outliers
library(tidyr)
# Reshape all numeric columns to long form (one row per variable/value pair)
# so that every variable gets its own boxplot facet.
numericVars <- select_if(facebook_Df, is.numeric)
# With no column selection gather() stacks every column, which avoids the
# fragile `1:dim(x)[2]` index sequence.
numericVars <- numericVars %>%
  gather(key = "variable", value = "values")
numericVars %>%
  ggplot() +
  geom_boxplot(aes(x = variable, y = values)) +
  facet_wrap(~variable, ncol = 6, scales = "free") +
  # theme_minimal() is a complete theme: it has to come BEFORE the theme()
  # tweaks, otherwise it replaces them and the strip/text settings are lost.
  theme_minimal() +
  theme(strip.text.x = element_blank(), text = element_text(size = 9))
These distributions show how non-normal and dispersed the sample population is.
# plot the distribution of a numeric variable
# by gender using boxplots
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
# Draw a horizontal boxplot of one column of `facebook_Df`, split by gender.
#
# var: unquoted column name; it is forwarded to aes() through {{ }}.
#
# The plot is printed as a side effect; the function's value is whatever
# print() returns for the plot.
num_plot <- function(var) {
  # Boxes with hollow black outlier markers.
  box_layer <- geom_boxplot(
    size = 1,
    outlier.shape = 1,
    outlier.color = "black",
    outlier.size = 3
  )
  # The y axis label is deliberately left empty.
  plot_labels <- labs(
    title = "Bivariate Graph",
    subtitle = "Gender by Numeric Variable",
    x = "Gender",
    y = ""
  )
  gender_plot <- ggplot(
    facebook_Df,
    aes(x = factor(gender), y = {{ var }}, color = gender)
  ) +
    box_layer +
    plot_labels +
    theme_minimal() +
    theme(legend.position = "none") +
    coord_flip()
  print(gender_plot)
}
# One gender-split boxplot per numeric column; the column is passed as a
# bare (unquoted) name.
num_plot(age)
num_plot(tenure)
num_plot(friend_count)
num_plot(friendships_initiated)
num_plot(likes)
# Numeric columns only.
df_num <- facebook_Df[vapply(facebook_Df, is.numeric, logical(1))]
# Pairwise correlations, each computed from the rows complete for that pair
# ("pairwise" is an abbreviation of "pairwise.complete.obs").
corr <- cor(df_num, use = "pairwise.complete.obs")
# Render the matrix, rounded to two decimals, as a table.
kable(round(corr, digits = 2))
| | age | tenure | friend_count | friendships_initiated | likes |
|---|---|---|---|---|---|
| age | 1.00 | 0.46 | -0.03 | -0.06 | -0.01 |
| tenure | 0.46 | 1.00 | 0.17 | 0.13 | 0.06 |
| friend_count | -0.03 | 0.17 | 1.00 | 0.83 | 0.30 |
| friendships_initiated | -0.06 | 0.13 | 0.83 | 1.00 | 0.29 |
| likes | -0.01 | 0.06 | 0.30 | 0.29 | 1.00 |
library(ggcorrplot)
# Heatmap of the correlation matrix: variables reordered by hierarchical
# clustering, lower triangle only, coefficients printed in the cells.
# (The stray empty argument after `corr` has been removed.)
ggcorrplot(corr,
           hc.order = TRUE,
           type = "lower",
           lab = TRUE)
Unfortunately, given the dearth of information, especially about the outcome variable, we cannot continue our analysis with statistical models such as linear regression.