Required packages

Below are the packages required to this Report.

library(readr)
library(tidyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## The following objects are masked from 'package:base':
## 
##     format.pval, units
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date
library(ggplot2)
library(forecast)
library(mlr)
## Loading required package: ParamHelpers
## 
## Attaching package: 'mlr'
## The following object is masked from 'package:Hmisc':
## 
##     impute

Executive Summary

  1. Reading .CSV files
  2. Check the head,dimensions and structures of datasets imported.
  3. Merge the datasets and check the datatypes of the columns
  4. Applying any datatype convesions as necessary(Factor convesions, date conversions)
  5. Function for removing redundant columns is made and applies
  6. Did checks for whether the data is tidy or not(In our case, the data was tidy)
  7. Checking for Na and removing them if present
  8. Out of the huge number of columns, we subsetted the necessary columns where outliers detection and removal could be used
  9. Outliers were removed for the subsetted columns by using capping and transformations
  10. Mutliple transformations were applied to the selected variable and the best transformation which made the distribution most similar to normal distribution was identified

Data

Dataset Summary

events_description <- read.csv("C:/Users/jeeva/Downloads/meetups-data-from-meetupcom/events.csv")
groups_description <- read_csv("C:/Users/jeeva/Downloads/meetups-data-from-meetupcom/groups.csv")
## Parsed with column specification:
## cols(
##   .default = col_character(),
##   group_id = col_double(),
##   category_id = col_double(),
##   city_id = col_double(),
##   group_photo.photo_id = col_double(),
##   lat = col_double(),
##   lon = col_double(),
##   members = col_double(),
##   organizer.member_id = col_double(),
##   organizer.photo.photo_id = col_double(),
##   rating = col_double(),
##   utc_offset = col_double()
## )
## See spec(...) for full column specifications.
events_description
groups_description

Below steps are performed in this block of code

Read .CSV files

events <- read.csv("C:/Users/jeeva/Downloads/meetups-data-from-meetupcom/events.csv")
groups <- read_csv("C:/Users/jeeva/Downloads/meetups-data-from-meetupcom/groups.csv")
## Parsed with column specification:
## cols(
##   .default = col_character(),
##   group_id = col_double(),
##   category_id = col_double(),
##   city_id = col_double(),
##   group_photo.photo_id = col_double(),
##   lat = col_double(),
##   lon = col_double(),
##   members = col_double(),
##   organizer.member_id = col_double(),
##   organizer.photo.photo_id = col_double(),
##   rating = col_double(),
##   utc_offset = col_double()
## )
## See spec(...) for full column specifications.
head(events,10)
head(groups,10)

Check the head and dimentions of datasets imported.

dim(events)
## [1] 5807   47
dim(groups)
## [1] 16330    35

Merge the dataframes to create a new dataframe with the name df.

df <- merge(events, groups, by = "group_id", all= FALSE)
head(df,10)
  • Check the dimentions of new dataframes
dim(df)
## [1] 5807   81

The number of columns are huge which will make the subsetting step necessary ## Understanding the data

  • Structure of dataframe is checked
str(df)
## 'data.frame':    5807 obs. of  81 variables:
##  $ group_id                    : int  23713 54691 54691 54691 54691 54691 54691 54691 54691 54691 ...
##  $ event_id                    : Factor w/ 5807 levels "153868222","184167702",..: 545 5349 5356 5353 5346 5347 5357 5352 176 5355 ...
##  $ created.x                   : Factor w/ 770 levels "1/03/2015 5:41",..: 560 379 379 379 379 379 379 379 593 379 ...
##  $ duration                    : int  7200 10800 10800 10800 10800 10800 10800 10800 14400 10800 ...
##  $ event_url                   : Factor w/ 5807 levels "https://www.meetup.com/1001-Books-SF/events/240930871/",..: 530 5527 5535 5533 5524 5525 5536 5530 5532 5534 ...
##  $ fee.accepts                 : Factor w/ 4 levels "cash","others",..: 1 2 2 2 2 2 2 2 2 2 ...
##  $ fee.amount                  : num  1 0 0 0 0 0 0 0 0 0 ...
##  $ fee.currency                : Factor w/ 2 levels "not_found","USD": 2 1 1 1 1 1 1 1 1 1 ...
##  $ fee.description             : Factor w/ 1 level "per person": 1 1 1 1 1 1 1 1 1 1 ...
##  $ fee.label                   : Factor w/ 2 levels "price","Price": 1 2 2 2 2 2 2 2 2 2 ...
##  $ fee.required                : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ group.created               : Factor w/ 341 levels "1/01/2011 20:14",..: 332 154 154 154 154 154 154 154 154 154 ...
##  $ group.group_lat             : num  40.8 40.8 40.8 40.8 40.8 ...
##  $ group.group_lon             : num  -74 -74 -74 -74 -74 ...
##  $ group.join_mode             : Factor w/ 2 levels "approval","open": 2 2 2 2 2 2 2 2 2 2 ...
##  $ group.name                  : Factor w/ 341 levels "1001 Books You Must Read Before You Die - San Francisco",..: 307 279 279 279 279 279 279 279 279 279 ...
##  $ group.urlname               : Factor w/ 341 levels "1001-Books-SF",..: 38 315 315 315 315 315 315 315 315 315 ...
##  $ group.who                   : Factor w/ 202 levels "$Million Dollar Leaders",..: 29 180 180 180 180 180 180 180 180 180 ...
##  $ headcount                   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ how_to_find_us              : Factor w/ 194 levels "#SocialMediaNYC #DigitalMarketingNYC #socialmediamarketingnyc #holisticspeakersnyc #healthyageingnyc #nyc #heal"| __truncated__,..: 76 112 112 112 112 112 112 112 127 112 ...
##  $ maybe_rsvp_count            : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ event_name                  : Factor w/ 613 levels "","$3.00 Happy Hr - NYC Small Business Owners, Salespeople Consultants, Networkers",..: 407 396 396 396 396 396 396 396 85 396 ...
##  $ photo_url                   : Factor w/ 1 level "not_found": 1 1 1 1 1 1 1 1 1 1 ...
##  $ rating.average              : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ rating.count                : int  -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
##  $ rsvp_limit                  : int  -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
##  $ event_status                : Factor w/ 1 level "upcoming": 1 1 1 1 1 1 1 1 1 1 ...
##  $ event_time                  : Factor w/ 3438 levels "1/01/2018 18:00",..: 2664 2018 1868 1941 2221 2005 1656 2294 2585 2189 ...
##  $ updated                     : Factor w/ 892 levels "1/03/2015 5:59",..: 644 715 208 208 204 204 208 756 707 208 ...
##  $ utc_offset.x                : int  -14400 -18000 -14400 -14400 -18000 -18000 -14400 -14400 -14400 -14400 ...
##  $ venue.address_1             : Factor w/ 450 levels "1 Bryant Park",..: 311 287 287 287 287 287 287 287 134 287 ...
##  $ venue.address_2             : Factor w/ 18 levels "(@ 156th Street)",..: 14 14 14 14 14 14 14 14 14 14 ...
##  $ venue.city                  : Factor w/ 10 levels "Chicago","Chicago, IL",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ venue.country               : Factor w/ 2 levels "us","US": 2 1 1 1 1 1 1 1 1 1 ...
##  $ venue_id                    : int  25572693 24102474 24102474 24102474 24102474 24102474 24102474 24102474 24776027 24102474 ...
##  $ venue.lat                   : num  40.7 40.7 40.7 40.7 40.7 ...
##  $ venue.localized_country_name: Factor w/ 1 level "USA": 1 1 1 1 1 1 1 1 1 1 ...
##  $ venue.lon                   : num  -74 -74 -74 -74 -74 ...
##  $ venue.name                  : Factor w/ 440 levels "1 California St #200",..: 214 374 374 374 374 374 374 374 334 374 ...
##  $ venue.phone                 : num  -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
##  $ venue.repinned              : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ venue.state                 : Factor w/ 6 levels "CA","Il","IL",..: 5 6 6 6 6 6 6 6 6 6 ...
##  $ venue.zip                   : int  -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
##  $ visibility.x                : Factor w/ 1 level "public": 1 1 1 1 1 1 1 1 1 1 ...
##  $ waitlist_count              : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ why                         : Factor w/ 1 level "not_found": 1 1 1 1 1 1 1 1 1 1 ...
##  $ yes_rsvp_count              : int  5 2 2 2 2 2 2 2 1 2 ...
##  $ category_id                 : num  18 2 2 2 2 2 2 2 2 2 ...
##  $ category.name               : chr  "book clubs" "career/business" "career/business" "career/business" ...
##  $ category.shortname          : chr  "book-clubs" "career-business" "career-business" "career-business" ...
##  $ city_id                     : num  10001 10001 10001 10001 10001 ...
##  $ city                        : chr  "New York" "New York" "New York" "New York" ...
##  $ country                     : chr  "US" "US" "US" "US" ...
##  $ created.y                   : chr  "8/10/2002 17:22" "20/12/2002 16:20" "20/12/2002 16:20" "20/12/2002 16:20" ...
##  $ group_photo.base_url        : chr  "https://secure.meetupstatic.com" "https://secure.meetupstatic.com" "https://secure.meetupstatic.com" "https://secure.meetupstatic.com" ...
##  $ group_photo.highres_link    : chr  "https://secure.meetupstatic.com/photos/event/e/6/6/0/highres_1618976.jpeg" "https://secure.meetupstatic.com/photos/event/8/b/3/a/highres_114215642.jpeg" "https://secure.meetupstatic.com/photos/event/8/b/3/a/highres_114215642.jpeg" "https://secure.meetupstatic.com/photos/event/8/b/3/a/highres_114215642.jpeg" ...
##  $ group_photo.photo_id        : num  1.62e+06 1.14e+08 1.14e+08 1.14e+08 1.14e+08 ...
##  $ group_photo.photo_link      : chr  "https://secure.meetupstatic.com/photos/event/e/6/6/0/600_1618976.jpeg" "https://secure.meetupstatic.com/photos/event/8/b/3/a/600_114215642.jpeg" "https://secure.meetupstatic.com/photos/event/8/b/3/a/600_114215642.jpeg" "https://secure.meetupstatic.com/photos/event/8/b/3/a/600_114215642.jpeg" ...
##  $ group_photo.thumb_link      : chr  "https://secure.meetupstatic.com/photos/event/e/6/6/0/thumb_1618976.jpeg" "https://secure.meetupstatic.com/photos/event/8/b/3/a/thumb_114215642.jpeg" "https://secure.meetupstatic.com/photos/event/8/b/3/a/thumb_114215642.jpeg" "https://secure.meetupstatic.com/photos/event/8/b/3/a/thumb_114215642.jpeg" ...
##  $ group_photo.type            : chr  "event" "event" "event" "event" ...
##  $ join_mode                   : chr  "open" "open" "open" "open" ...
##  $ lat                         : num  40.8 40.8 40.8 40.8 40.8 ...
##  $ link                        : chr  "https://www.meetup.com/bookcrossing-30/" "https://www.meetup.com/Zorkas-entrepreneur-success/" "https://www.meetup.com/Zorkas-entrepreneur-success/" "https://www.meetup.com/Zorkas-entrepreneur-success/" ...
##  $ lon                         : num  -74 -74 -74 -74 -74 ...
##  $ members                     : num  347 1426 1426 1426 1426 ...
##  $ group_name                  : chr  "The NYC Bookcrossing Meetup" "SUCCESS! An Entrepreneurial Meetup Group" "SUCCESS! An Entrepreneurial Meetup Group" "SUCCESS! An Entrepreneurial Meetup Group" ...
##  $ organizer.member_id         : num  3198499 82494832 82494832 82494832 82494832 ...
##  $ organizer.name              : chr  "Hazel" "Zorka Kovacevich" "Zorka Kovacevich" "Zorka Kovacevich" ...
##  $ organizer.photo.base_url    : chr  "https://secure.meetupstatic.com" "https://secure.meetupstatic.com" "https://secure.meetupstatic.com" "https://secure.meetupstatic.com" ...
##  $ organizer.photo.highres_link: chr  "https://secure.meetupstatic.com/photos/member/5/9/8/a/highres_36382922.jpeg" "https://secure.meetupstatic.com/photos/member/b/0/e/c/highres_100605292.jpeg" "https://secure.meetupstatic.com/photos/member/b/0/e/c/highres_100605292.jpeg" "https://secure.meetupstatic.com/photos/member/b/0/e/c/highres_100605292.jpeg" ...
##  $ organizer.photo.photo_id    : num  3.64e+07 1.01e+08 1.01e+08 1.01e+08 1.01e+08 ...
##  $ organizer.photo.photo_link  : chr  "https://secure.meetupstatic.com/photos/member/5/9/8/a/member_36382922.jpeg" "https://secure.meetupstatic.com/photos/member/b/0/e/c/member_100605292.jpeg" "https://secure.meetupstatic.com/photos/member/b/0/e/c/member_100605292.jpeg" "https://secure.meetupstatic.com/photos/member/b/0/e/c/member_100605292.jpeg" ...
##  $ organizer.photo.thumb_link  : chr  "https://secure.meetupstatic.com/photos/member/5/9/8/a/thumb_36382922.jpeg" "https://secure.meetupstatic.com/photos/member/b/0/e/c/thumb_100605292.jpeg" "https://secure.meetupstatic.com/photos/member/b/0/e/c/thumb_100605292.jpeg" "https://secure.meetupstatic.com/photos/member/b/0/e/c/thumb_100605292.jpeg" ...
##  $ organizer.photo.type        : chr  "member" "member" "member" "member" ...
##  $ rating                      : num  3.98 4.33 4.33 4.33 4.33 4.33 4.33 4.33 4.33 4.33 ...
##  $ state                       : chr  "NY" "NY" "NY" "NY" ...
##  $ timezone                    : chr  "US/Eastern" "US/Eastern" "US/Eastern" "US/Eastern" ...
##  $ urlname                     : chr  "bookcrossing-30" "Zorkas-entrepreneur-success" "Zorkas-entrepreneur-success" "Zorkas-entrepreneur-success" ...
##  $ utc_offset.y                : num  -14400 -14400 -14400 -14400 -14400 -14400 -14400 -14400 -14400 -14400 ...
##  $ visibility.y                : chr  "public" "public" "public" "public" ...
##  $ who                         : chr  "BookCrossers" "Successful Entrepreneurs" "Successful Entrepreneurs" "Successful Entrepreneurs" ...
  • The datatypes of created.x, group.created were changed to Date format
df$created.x <- as.Date(df$created.x)
df$group.created <- as.Date(df$group.created)

The variables event_time, updated, created.y which were of type integer, integer and double respectively are converted to type Date.

df$event_time <- as.Date(df$event_time)
df$updated <- as.Date(df$updated)
df$created.y <- as.Date(df$created.y)
  • fee.required and venue.repinned columns are converted into factors with different levels.
df$fee.required <- df$fee.required %>% factor(levels = c(0,1),
                                              labels = c('No','Yes'))
df$venue.repinned <- df$venue.repinned %>% factor(levels = c(0,1),labels = c('No','Yes'))

-Variables venue.zip ,category.name ,category.shortname etc are converted into factors.

df$venue.zip <- as.factor(df$venue.zip)
df$category.name <- as.factor(df$category.name)
df$category.shortname <- as.factor(df$category.shortname)


df$group_photo.base_url <- as.factor(df$group_photo.base_url)
df$group_photo.highres_link <- as.factor(df$group_photo.highres_link)
df$group_photo.photo_link <- as.factor(df$group_photo.photo_link)
df$group_photo.thumb_link <- as.factor(df$group_photo.thumb_link)
df$group_photo.type <- as.factor(df$group_photo.type)
df$join_mode <- as.factor(df$join_mode)
df$link <- as.factor(df$link)
df$group_name <- as.factor(df$group_name)
df$organizer.name <- as.factor(df$organizer.name)
df$organizer.photo.base_url <- as.factor(df$organizer.photo.base_url)
df$organizer.photo.highres_link <- as.factor(df$organizer.photo.highres_link)
df$organizer.photo.photo_link <- as.factor(df$organizer.photo.photo_link)
df$organizer.photo.thumb_link <- as.factor(df$organizer.photo.thumb_link)
df$organizer.photo.type <- as.factor(df$organizer.photo.type)
df$state <- as.factor(df$state)
df$timezone <- as.factor(df$timezone)
df$urlname <- as.factor(df$urlname)
df$visibility.y <- as.factor(df$visibility.y)
df$who <- as.factor(df$who)
df$city <- as.factor(df$city)
df$country <- as.factor(df$country)

Changing venue.country variable to upper

table(df$venue.country)
## 
##   us   US 
## 5312  495
df$venue.country <- toupper(df$venue.country)

Changing the data type of venue to factor

df$venue.country <- as.factor(df$venue.country)
table(df$venue.country)
## 
##   US 
## 5807
  • Structure of dataframe is checked after type conversion operations.
str(df)
## 'data.frame':    5807 obs. of  81 variables:
##  $ group_id                    : int  23713 54691 54691 54691 54691 54691 54691 54691 54691 54691 ...
##  $ event_id                    : Factor w/ 5807 levels "153868222","184167702",..: 545 5349 5356 5353 5346 5347 5357 5352 176 5355 ...
##  $ created.x                   : Date, format: "0025-10-20" "0020-08-20" ...
##  $ duration                    : int  7200 10800 10800 10800 10800 10800 10800 10800 14400 10800 ...
##  $ event_url                   : Factor w/ 5807 levels "https://www.meetup.com/1001-Books-SF/events/240930871/",..: 530 5527 5535 5533 5524 5525 5536 5530 5532 5534 ...
##  $ fee.accepts                 : Factor w/ 4 levels "cash","others",..: 1 2 2 2 2 2 2 2 2 2 ...
##  $ fee.amount                  : num  1 0 0 0 0 0 0 0 0 0 ...
##  $ fee.currency                : Factor w/ 2 levels "not_found","USD": 2 1 1 1 1 1 1 1 1 1 ...
##  $ fee.description             : Factor w/ 1 level "per person": 1 1 1 1 1 1 1 1 1 1 ...
##  $ fee.label                   : Factor w/ 2 levels "price","Price": 1 2 2 2 2 2 2 2 2 2 ...
##  $ fee.required                : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ group.created               : Date, format: "0008-10-20" "0020-12-20" ...
##  $ group.group_lat             : num  40.8 40.8 40.8 40.8 40.8 ...
##  $ group.group_lon             : num  -74 -74 -74 -74 -74 ...
##  $ group.join_mode             : Factor w/ 2 levels "approval","open": 2 2 2 2 2 2 2 2 2 2 ...
##  $ group.name                  : Factor w/ 341 levels "1001 Books You Must Read Before You Die - San Francisco",..: 307 279 279 279 279 279 279 279 279 279 ...
##  $ group.urlname               : Factor w/ 341 levels "1001-Books-SF",..: 38 315 315 315 315 315 315 315 315 315 ...
##  $ group.who                   : Factor w/ 202 levels "$Million Dollar Leaders",..: 29 180 180 180 180 180 180 180 180 180 ...
##  $ headcount                   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ how_to_find_us              : Factor w/ 194 levels "#SocialMediaNYC #DigitalMarketingNYC #socialmediamarketingnyc #holisticspeakersnyc #healthyageingnyc #nyc #heal"| __truncated__,..: 76 112 112 112 112 112 112 112 127 112 ...
##  $ maybe_rsvp_count            : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ event_name                  : Factor w/ 613 levels "","$3.00 Happy Hr - NYC Small Business Owners, Salespeople Consultants, Networkers",..: 407 396 396 396 396 396 396 396 85 396 ...
##  $ photo_url                   : Factor w/ 1 level "not_found": 1 1 1 1 1 1 1 1 1 1 ...
##  $ rating.average              : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ rating.count                : int  -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
##  $ rsvp_limit                  : int  -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
##  $ event_status                : Factor w/ 1 level "upcoming": 1 1 1 1 1 1 1 1 1 1 ...
##  $ event_time                  : Date, format: "0030-10-20" "0026-02-20" ...
##  $ updated                     : Date, format: "0025-10-20" "0027-02-20" ...
##  $ utc_offset.x                : int  -14400 -18000 -14400 -14400 -18000 -18000 -14400 -14400 -14400 -14400 ...
##  $ venue.address_1             : Factor w/ 450 levels "1 Bryant Park",..: 311 287 287 287 287 287 287 287 134 287 ...
##  $ venue.address_2             : Factor w/ 18 levels "(@ 156th Street)",..: 14 14 14 14 14 14 14 14 14 14 ...
##  $ venue.city                  : Factor w/ 10 levels "Chicago","Chicago, IL",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ venue.country               : Factor w/ 1 level "US": 1 1 1 1 1 1 1 1 1 1 ...
##  $ venue_id                    : int  25572693 24102474 24102474 24102474 24102474 24102474 24102474 24102474 24776027 24102474 ...
##  $ venue.lat                   : num  40.7 40.7 40.7 40.7 40.7 ...
##  $ venue.localized_country_name: Factor w/ 1 level "USA": 1 1 1 1 1 1 1 1 1 1 ...
##  $ venue.lon                   : num  -74 -74 -74 -74 -74 ...
##  $ venue.name                  : Factor w/ 440 levels "1 California St #200",..: 214 374 374 374 374 374 374 374 334 374 ...
##  $ venue.phone                 : num  -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
##  $ venue.repinned              : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ venue.state                 : Factor w/ 6 levels "CA","Il","IL",..: 5 6 6 6 6 6 6 6 6 6 ...
##  $ venue.zip                   : Factor w/ 1 level "-1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ visibility.x                : Factor w/ 1 level "public": 1 1 1 1 1 1 1 1 1 1 ...
##  $ waitlist_count              : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ why                         : Factor w/ 1 level "not_found": 1 1 1 1 1 1 1 1 1 1 ...
##  $ yes_rsvp_count              : int  5 2 2 2 2 2 2 2 1 2 ...
##  $ category_id                 : num  18 2 2 2 2 2 2 2 2 2 ...
##  $ category.name               : Factor w/ 3 levels "book clubs","career/business",..: 1 2 2 2 2 2 2 2 2 2 ...
##  $ category.shortname          : Factor w/ 3 levels "arts-culture",..: 2 3 3 3 3 3 3 3 3 3 ...
##  $ city_id                     : num  10001 10001 10001 10001 10001 ...
##  $ city                        : Factor w/ 4 levels "Chicago","New York",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ country                     : Factor w/ 1 level "US": 1 1 1 1 1 1 1 1 1 1 ...
##  $ created.y                   : Date, format: "0008-10-20" "0020-12-20" ...
##  $ group_photo.base_url        : Factor w/ 2 levels "https://secure.meetupstatic.com",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ group_photo.highres_link    : Factor w/ 267 levels "https://secure.meetupstatic.com/photos/event/1/0/d/6/highres_456544310.jpeg",..: 258 161 161 161 161 161 161 161 161 161 ...
##  $ group_photo.photo_id        : num  1.62e+06 1.14e+08 1.14e+08 1.14e+08 1.14e+08 ...
##  $ group_photo.photo_link      : Factor w/ 267 levels "https://secure.meetupstatic.com/photos/event/1/0/d/6/600_456544310.jpeg",..: 258 161 161 161 161 161 161 161 161 161 ...
##  $ group_photo.thumb_link      : Factor w/ 267 levels "https://secure.meetupstatic.com/photos/event/1/0/d/6/thumb_456544310.jpeg",..: 258 161 161 161 161 161 161 161 161 161 ...
##  $ group_photo.type            : Factor w/ 2 levels "event","others": 1 1 1 1 1 1 1 1 1 1 ...
##  $ join_mode                   : Factor w/ 2 levels "approval","open": 2 2 2 2 2 2 2 2 2 2 ...
##  $ lat                         : num  40.8 40.8 40.8 40.8 40.8 ...
##  $ link                        : Factor w/ 341 levels "https://www.meetup.com/1001-Books-SF/",..: 38 315 315 315 315 315 315 315 315 315 ...
##  $ lon                         : num  -74 -74 -74 -74 -74 ...
##  $ members                     : num  347 1426 1426 1426 1426 ...
##  $ group_name                  : Factor w/ 341 levels "1001 Books You Must Read Before You Die - San Francisco",..: 307 279 279 279 279 279 279 279 279 279 ...
##  $ organizer.member_id         : num  3198499 82494832 82494832 82494832 82494832 ...
##  $ organizer.name              : Factor w/ 278 levels "Admin Jon","AIASF",..: 88 272 272 272 272 272 272 272 272 272 ...
##  $ organizer.photo.base_url    : Factor w/ 2 levels "https://secure.meetupstatic.com",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ organizer.photo.highres_link: Factor w/ 268 levels "https://secure.meetupstatic.com/photos/member/1/0/0/3/highres_261964099.jpeg",..: 83 191 191 191 191 191 191 191 191 191 ...
##  $ organizer.photo.photo_id    : num  3.64e+07 1.01e+08 1.01e+08 1.01e+08 1.01e+08 ...
##  $ organizer.photo.photo_link  : Factor w/ 306 levels "https://secure.meetupstatic.com/photos/member/1/0/0/3/member_261964099.jpeg",..: 88 218 218 218 218 218 218 218 218 218 ...
##  $ organizer.photo.thumb_link  : Factor w/ 306 levels "https://secure.meetupstatic.com/photos/member/1/0/0/3/thumb_261964099.jpeg",..: 88 218 218 218 218 218 218 218 218 218 ...
##  $ organizer.photo.type        : Factor w/ 2 levels "member","others": 1 1 1 1 1 1 1 1 1 1 ...
##  $ rating                      : num  3.98 4.33 4.33 4.33 4.33 4.33 4.33 4.33 4.33 4.33 ...
##  $ state                       : Factor w/ 3 levels "CA","IL","NY": 3 3 3 3 3 3 3 3 3 3 ...
##  $ timezone                    : Factor w/ 3 levels "US/Central","US/Eastern",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ urlname                     : Factor w/ 341 levels "1001-Books-SF",..: 38 315 315 315 315 315 315 315 315 315 ...
##  $ utc_offset.y                : num  -14400 -14400 -14400 -14400 -14400 -14400 -14400 -14400 -14400 -14400 ...
##  $ visibility.y                : Factor w/ 1 level "public": 1 1 1 1 1 1 1 1 1 1 ...
##  $ who                         : Factor w/ 202 levels "$Million Dollar Leaders",..: 29 180 180 180 180 180 180 180 180 180 ...
  • Funtion to Remove redundant columns using identical funtion to check similar columns named differently.
test_function <- function(col1, col2){
  return( identical(df[[col1]],df[[col2]]))
}

for(col1 in names(df)){
  for(col2 in names(df)){
    if(test_function(col1,col2) && col1 != col2)
      print(paste(col1, col2, test_function(col1,col2)))
  }
}
## [1] "group.created created.y TRUE"
## [1] "group.group_lat lat TRUE"
## [1] "group.group_lon lon TRUE"
## [1] "group.join_mode join_mode TRUE"
## [1] "group.urlname urlname TRUE"
## [1] "group.who who TRUE"
## [1] "headcount maybe_rsvp_count TRUE"
## [1] "headcount rating.average TRUE"
## [1] "maybe_rsvp_count headcount TRUE"
## [1] "maybe_rsvp_count rating.average TRUE"
## [1] "photo_url why TRUE"
## [1] "rating.average headcount TRUE"
## [1] "rating.average maybe_rsvp_count TRUE"
## [1] "venue.country country TRUE"
## [1] "visibility.x visibility.y TRUE"
## [1] "why photo_url TRUE"
## [1] "country venue.country TRUE"
## [1] "created.y group.created TRUE"
## [1] "join_mode group.join_mode TRUE"
## [1] "lat group.group_lat TRUE"
## [1] "lon group.group_lon TRUE"
## [1] "urlname group.urlname TRUE"
## [1] "visibility.y visibility.x TRUE"
## [1] "who group.who TRUE"
  • Removing redundant variables from dataframe
df <- subset(df, select = -c(group.created,
                             group.group_lat,
                             group.group_lon,
                             group.join_mode,
                             group.urlname,
                             group.who,
                             headcount,
                             photo_url,
                             venue.country,
                             visibility.x))

Tidy & Manipulate Data I

head(df,10)
  1. All observations are in single in row.
  2. All columns contains single variable.
  3. Each cell has one value.

Tidy & Manipulate Data II

Duration column which is in seconds is converted to [hours minutes seconds] and mutated into new column called period

df <- mutate(df, period = seconds_to_period(duration))
head(df$period)
## [1] "2H 0M 0S" "3H 0M 0S" "3H 0M 0S" "3H 0M 0S" "3H 0M 0S" "3H 0M 0S"

Scan I

test <- data_frame(colnames(df),sapply(df, function(x) sum(is.na(x))))
## Warning: `data_frame()` is deprecated, use `tibble()`.
## This warning is displayed once per session.
head(test
     )
a <- (df$fee.amount >= 0)
b <- (df$city_id >= 0)
c <- (df$members >= 0)
d <- (df$group_id >= 0)

summary(a)
##    Mode    TRUE 
## logical    5807
summary(b)
##    Mode    TRUE 
## logical    5807
summary(c)
##    Mode    TRUE 
## logical    5807
summary(d)
##    Mode    TRUE 
## logical    5807

Scan II

Plotting Box plot for all the numerical variables to see possibility of outliers

some columns are numeric but non linear which makes no sense to be plotted as a boxplot such as multiple ID??s

numeric_columns <- names(df %>% select_if(is.numeric))
numeric_columns 
##  [1] "group_id"                 "duration"                
##  [3] "fee.amount"               "maybe_rsvp_count"        
##  [5] "rating.average"           "rating.count"            
##  [7] "rsvp_limit"               "utc_offset.x"            
##  [9] "venue_id"                 "venue.lat"               
## [11] "venue.lon"                "venue.phone"             
## [13] "waitlist_count"           "yes_rsvp_count"          
## [15] "category_id"              "city_id"                 
## [17] "group_photo.photo_id"     "lat"                     
## [19] "lon"                      "members"                 
## [21] "organizer.member_id"      "organizer.photo.photo_id"
## [23] "rating"                   "utc_offset.y"            
## [25] "period"
non_linear<-c( "group_id" ,"utc_offset.x" ,"venue_id" ,"venue.lat","venue.lon" ,"venue.phone", "category_id", "city_id","group_photo.photo_id","lat", "lon" ,"organizer.member_id"  , "organizer.photo.photo_id","utc_offset.y"      )


non_useful<-c("rating.average"  , "rating.count" ,"maybe_rsvp_count", "waitlist_count", "period" ,"duration"  ,     "fee.amount"   ,  "rsvp_limit"   ,  "yes_rsvp_count" )
x <- unlist(strsplit(numeric_columns, " "))

numeric_columns <- x[!x %in% non_linear]
numeric_columns 
##  [1] "duration"         "fee.amount"       "maybe_rsvp_count"
##  [4] "rating.average"   "rating.count"     "rsvp_limit"      
##  [7] "waitlist_count"   "yes_rsvp_count"   "members"         
## [10] "rating"           "period"
x <- unlist(strsplit(numeric_columns, " "))
numeric_columns <- x[!x %in% non_useful]

par(mfcol = c(2, 1), mar=c(1,1,1,1))
for(col in numeric_columns)
  boxplot(df[col], main = col)

cap <- function(x){
  quantiles <- quantile( x, c(.05, 0.25, 0.75, .95 ) )
  x[ x < quantiles[2] - 1.5*IQR(x) ] <- quantiles[1]
  x[ x > quantiles[3] + 1.5*IQR(x) ] <- quantiles[4]
  x
}

-Applying the function for Capping/replacing outliers with nearest quantile(0.05 or 0.95) for all numerical variables

df1<-df
df1$members <- df$members %>% cap()
df1$rating <- df$rating %>%  cap()
par(mfcol = c(1, 2), mar=c(1,1,1,1))
for(col in numeric_columns)
  boxplot(df[col], main = col)

Outliers werent capped due to their values being extremes! One possible solution is to try to cap the values after transforming them

df1$members <- df$members %>% log() %>% cap()
df1$rating <- df$rating %>% log10() %>%  cap()
par(mfcol = c(1, 2), mar=c(1,1,1,1))
for(col in numeric_columns)
  boxplot(df1[col], main = col)
## Warning in bplt(at[i], wid = width[i], stats = z$stats[, i], out =
## z$out[z$group == : Outlier (-Inf) in boxplot 1 is not drawn

Outliers are removed successfully!

Transform

par(mfcol = c(1,1), mar=c(2,2,2,2))
hist(df$members)

summary(df$members)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       4     259     575    1829    1660   20497
par(mfcol = c(1,1), mar=c(1,1,1,1))

log10_Transformation <- log10(df$members)
hist(log10_Transformation)

log_Transformation <- log(df$members)
hist(log_Transformation)

sqrt_Transformation <- sqrt(df$members)
hist(sqrt_Transformation)

power_Transformation <- 1/df$members
hist(power_Transformation)

boxcox_Transformation<- BoxCox(df$members,lambda = "auto")
hist(boxcox_Transformation)