Pseudo_Facebook

# Load the pseudo-Facebook user data and inspect its structure.
# NOTE(review): the absolute Windows path below is specific to one machine;
# point it at your local copy of pseudo_facebook.csv.
# stringsAsFactors = TRUE is spelled out because the R >= 4.0 default (FALSE)
# would read `gender` as character, whereas the structure shown below
# reports it as a factor.
facebook_Df <- read.csv(
  "C:/Users/PC/Documents/R_4DS/Facebook/pseudo_facebook.csv",
  stringsAsFactors = TRUE
)

str(facebook_Df)

## 'data.frame':    99003 obs. of  15 variables:
##  $ userid               : int  2094382 1192601 2083884 1203168 1733186 1524765 1136133 1680361 1365174 1712567 ...
##  $ age                  : int  14 14 14 14 14 14 13 13 13 13 ...
##  $ dob_day              : int  19 2 16 25 4 1 14 4 1 2 ...
##  $ dob_year             : int  1999 1999 1999 1999 1999 1999 2000 2000 2000 2000 ...
##  $ dob_month            : int  11 11 11 12 12 12 1 1 1 2 ...
##  $ gender               : Factor w/ 2 levels "female","male": 2 1 2 1 2 2 2 1 2 2 ...
##  $ tenure               : int  266 6 13 93 82 15 12 0 81 171 ...
##  $ friend_count         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ friendships_initiated: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ likes                : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ likes_received       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ mobile_likes         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ mobile_likes_received: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ www_likes            : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ www_likes_received   : int  0 0 0 0 0 0 0 0 0 0 ...

# Preview the first six rows of the data
head(facebook_Df)

##    userid age dob_day dob_year dob_month gender tenure friend_count
## 1 2094382  14      19     1999        11   male    266            0
## 2 1192601  14       2     1999        11 female      6            0
## 3 2083884  14      16     1999        11   male     13            0
## 4 1203168  14      25     1999        12 female     93            0
## 5 1733186  14       4     1999        12   male     82            0
## 6 1524765  14       1     1999        12   male     15            0
##   friendships_initiated likes likes_received mobile_likes mobile_likes_received
## 1                     0     0              0            0                     0
## 2                     0     0              0            0                     0
## 3                     0     0              0            0                     0
## 4                     0     0              0            0                     0
## 5                     0     0              0            0                     0
## 6                     0     0              0            0                     0
##   www_likes www_likes_received
## 1         0                  0
## 2         0                  0
## 3         0                  0
## 4         0                  0
## 5         0                  0
## 6         0                  0

# Names of the columns that contain at least one missing value
names(which(colSums(is.na(facebook_Df)) > 0))

## [1] "gender" "tenure"

# Number of missing values in each column
vapply(facebook_Df, function(column) sum(is.na(column)), numeric(1))

##                userid                   age               dob_day 
##                     0                     0                     0 
##              dob_year             dob_month                gender 
##                     0                     0                   175 
##                tenure          friend_count friendships_initiated 
##                     2                     0                     0 
##                 likes        likes_received          mobile_likes 
##                     0                     0                     0 
## mobile_likes_received             www_likes    www_likes_received 
##                     0                     0                     0

## Proportion of Missing Data by Columns
# Blank lines first, then a banner announcing the missing-data summary
cat("\n\n")

cat(
  "\n\n",
  "+===============  Proportion of Missing Data in the Dataset  =================+",
  "\n\n",
  sep = " "
)

## 
## 
##  +===============  Proportion of Missing Data in the Dataset  =================+

# Share of missing values per column, as a proportion of all rows.
# Four decimals are kept because the largest share (gender: 175 of 99003
# rows) is about 0.0018 and would print as 0 when rounded to two decimals.
pctmiss <- colSums(is.na(facebook_Df)) / nrow(facebook_Df)
round(pctmiss, 4)

##                userid                   age               dob_day 
##                0.0000                0.0000                0.0000 
##              dob_year             dob_month                gender 
##                0.0000                0.0000                0.0018 
##                tenure          friend_count friendships_initiated 
##                0.0000                0.0000                0.0000 
##                 likes        likes_received          mobile_likes 
##                0.0000                0.0000                0.0000 
## mobile_likes_received             www_likes    www_likes_received 
##                0.0000                0.0000                0.0000

Data Cleaning

Both the gender and tenure variables have missing values. Let us start the data cleaning process, beginning with tenure.

# Row positions at which tenure is missing
which(is.na(facebook_Df[["tenure"]]))

## [1] 35058 63980

Tenure

# Show the complete records whose tenure is missing.
# (A comparison such as `tenure == NA` cannot be used here: it evaluates
# to NA rather than TRUE, so is.na() is required.)
filter(facebook_Df, is.na(tenure))

##    userid age dob_day dob_year dob_month gender tenure friend_count
## 1 1414063  88      14     1925        12 female     NA           48
## 2 2095829 103       1     1910         1 female     NA          137
##   friendships_initiated likes likes_received mobile_likes mobile_likes_received
## 1                    41    18              5           17                     3
## 2                    52   110             74            0                    21
##   www_likes www_likes_received
## 1         1                  2
## 2       110                 53

One thing common to both rows is that both users are female, so we fill the missing tenure values with the mean tenure of female users.

## Subset of users recorded as female
femaleDf <- filter(facebook_Df, gender == "female")

## Mean tenure of that subset, ignoring missing tenures
femaleDf_mean <- mean(femaleDf[["tenure"]], na.rm = TRUE) #=> 587.2292

## Replace every missing tenure with the female mean
facebook_Df$tenure[is.na(facebook_Df$tenure)] <- femaleDf_mean

Gender

Here, the rows with missing gender can either be dropped or the missing values can be reassigned to a third category; let us go with the second option.

Note: if you are a beginner and try the method used for the numeric variable, you will get errors. Note how a missing value in a factor variable is replaced instead.

## Work on a copy of the gender column: convert the factor to character,
## fill the missing entries with "Unknown", then convert back to a factor
f_dta <- as.factor(replace_na(as.character(facebook_Df$gender), "Unknown"))

## Put the cleaned column back into the dataset
facebook_Df$gender <- f_dta

Miscellaneous

## Columns to drop
drp <- c(
  "userid", "dob_day", "dob_year", "dob_month",
  "likes_received", "mobile_likes", "mobile_likes_received",
  "www_likes", "www_likes_received"
)

## Keep every column whose name is not listed in `drp`
facebook_Df <- facebook_Df[, setdiff(names(facebook_Df), drp)]

There does not seem to be a definite Outcome Variable, so we will just explore relationships and distributions based on the DataType - Factor / Categorical and Numeric.

Before we go on, we can view the distribution of each variable. This is not strictly essential in an EDA; however, the original data description is not available for these variables, so this is a way to educate ourselves more about them and to see which ones do not look normally distributed.

Distributions

Univariate Analysis: Factor Variables

# Keep only the factor columns (here: gender)
factorVars <- select_if(facebook_Df, is.factor)

# Bar chart of user counts per gender, each bar annotated with its
# share of all users rounded to a whole percent
factorVars %>%
  count(gender) %>%
  mutate(pct = n / sum(n)) %>%
  mutate(pctlabel = paste0(round(pct * 100), "%")) %>%
  ggplot(aes(x = gender, y = n)) +
  geom_col() +
  geom_text(aes(label = pctlabel), vjust = -0.5) +
  labs(
    title = "Percentage Distribution of Facebook Users by Gender",
    x = "Gender",
    y = "Frequency"
  ) +
  theme_minimal()

Even the proportion of users with missing (Unknown) gender is negligible.

Univariate Analysis: Numeric Variables

## Visual representation of Numerical data using Boxplots and Outliers
library(tidyr)

# Keep only the numeric columns
numericVars <- select_if(facebook_Df, is.numeric)

# Reshape to long form: one row per (variable, value) pair.
# pivot_longer(everything()) replaces the superseded gather() call and its
# 1:dim(x)[2] column index.
numericVars <- numericVars %>%
  pivot_longer(
    cols = everything(),
    names_to = "variable",
    values_to = "values"
  )

# One boxplot per variable, each facet with its own scales.
# theme_minimal() is a complete theme, so it must come BEFORE the theme()
# tweaks; added afterwards it would override the strip-text and text-size
# settings.
numericVars %>%
  ggplot() +
  geom_boxplot(aes(x = variable, y = values)) +
  facet_wrap(~variable, ncol = 6, scales = "free") +
  theme_minimal() +
  theme(strip.text.x = element_blank(), text = element_text(size = 9))

This distribution shows how non-normal and dispersed the sample population is.

Bi-Variate Relationship: Numeric Variables by Factor Variables

# plot the distribution of each numeric variable
# by gender using boxplots
library(scales)

## 
## Attaching package: 'scales'

## The following object is masked from 'package:purrr':
## 
##     discard

## The following object is masked from 'package:readr':
## 
##     col_factor

# Draw and print a horizontal boxplot of one numeric column, split by gender.
#
# Args:
#   var:  Unquoted name of a column in `data` (e.g. `age`).
#   data: Data frame containing `gender` and the column named by `var`.
#         Defaults to the global `facebook_Df`, so existing calls such as
#         num_plot(age) keep working.
#
# Returns:
#   The ggplot object, invisibly, after printing it.
num_plot <- function(var, data = facebook_Df) {
  # Capture the column name as text so the value axis can be labelled;
  # with an empty label the plots for different columns look alike.
  var_label <- deparse(substitute(var))

  p <- data %>%
    ggplot(aes(x = factor(gender),
               y = {{ var }},
               color = gender)) +
    geom_boxplot(size = 1,
                 outlier.shape = 1,
                 outlier.color = "black",
                 outlier.size  = 3) +
    labs(title = "Bivariate Graph",
         subtitle = "Gender by Numeric Variable",
         x = "Gender",
         y = var_label) +
    theme_minimal() +
    # Colour only duplicates the gender axis, so the legend is hidden
    theme(legend.position = "none") +
    # Flip so the gender groups are listed on the vertical axis
    coord_flip()

  print(p)
}

# Draw the gender-split boxplot for each numeric column in turn
num_plot(age)

num_plot(tenure)

num_plot(friend_count)

num_plot(friendships_initiated)

num_plot(likes)

Bi-Variate Relationship: Numeric Variables by Numeric Variables

Pairwise Correlation

# Numeric columns only
df_num <- select_if(facebook_Df, is.numeric)

# Correlation matrix from pairwise-complete observations,
# displayed as a table rounded to two decimals
corr <- cor(df_num, use = "pairwise.complete.obs")
kable(round(corr, digits = 2))

	age	tenure	friend_count	friendships_initiated	likes
age	1.00	0.46	-0.03	-0.06	-0.01
tenure	0.46	1.00	0.17	0.13	0.06
friend_count	-0.03	0.17	1.00	0.83	0.30
friendships_initiated	-0.06	0.13	0.83	1.00	0.29
likes	-0.01	0.06	0.30	0.29	1.00

Visualise Correlation

library(ggcorrplot)

# Lower-triangle correlation plot with hierarchical-clustering order
# and the coefficients printed as labels
ggcorrplot(
  corr,
  hc.order = TRUE,
  type = "lower",
  lab = TRUE
)

Unfortunately, because of the dearth of information, especially on the outcome variable, we cannot continue our analysis using statistical models such as linear regression.