As part of 607 project2, I have selected below 3 datasets provided in my class discussions. 1. SnP 500 Financial Data 2. Employee Review on companies (Facebook, Google, Netflix, Amazon, Microsoft, Apple) 3. Google play dataset
List of companies in the S&P 500 (Standard and Poor’s 500). The S&P 500 is a free-float, capitalization-weighted index of the top 500 publicly listed stocks in the US (top 500 by market cap).
snp_data <- read.csv("https://raw.githubusercontent.com/san123i/CUNY/master/Semester1/607/Week5/project2/SnP500-financials.csv", header = T, sep = ",", stringsAsFactors = F, strip.white = T)
head(snp_data)
## Symbol Name Sector Price Price.Earnings
## 1 MMM 3M Company Industrials 222.89 24.31
## 2 AOS A.O. Smith Corp Industrials 60.24 27.76
## 3 ABT Abbott Laboratories Health Care 56.27 22.51
## 4 ABBV AbbVie Inc. Health Care 108.48 19.41
## 5 ACN Accenture plc Information Technology 150.51 25.47
## 6 ATVI Activision Blizzard Information Technology 65.83 31.80
## Dividend.Yield Earnings.Share X52.Week.Low X52.Week.High Market.Cap
## 1 2.3328617 7.92 259.770 175.490 138721000000
## 2 1.1479592 1.70 68.390 48.925 10783419933
## 3 1.9089824 0.26 64.600 42.280 102121000000
## 4 2.4995599 3.29 125.860 60.050 181386000000
## 5 1.7144699 5.44 162.600 114.820 98765855553
## 6 0.4319032 1.28 74.945 38.930 52518668144
## Price.Sales Price.Book
## 1 4.390271 11.34
## 2 3.575483 6.35
## 3 3.740480 3.19
## 4 6.291571 26.14
## 5 2.604117 10.62
## 6 10.595120 5.16
snp_longdata <- gather(data = snp_data, key = "Attribute", value="values",3:12)
head(snp_longdata)
## Symbol Name Attribute values
## 1 MMM 3M Company Sector Industrials
## 2 AOS A.O. Smith Corp Sector Industrials
## 3 ABT Abbott Laboratories Sector Health Care
## 4 ABBV AbbVie Inc. Sector Health Care
## 5 ACN Accenture plc Sector Information Technology
## 6 ATVI Activision Blizzard Sector Information Technology
snp_sector_data <- snp_longdata %>% filter(Attribute == 'Sector')
head(snp_sector_data)
## Symbol Name Attribute values
## 1 MMM 3M Company Sector Industrials
## 2 AOS A.O. Smith Corp Sector Industrials
## 3 ABT Abbott Laboratories Sector Health Care
## 4 ABBV AbbVie Inc. Sector Health Care
## 5 ACN Accenture plc Sector Information Technology
## 6 ATVI Activision Blizzard Sector Information Technology
#Calculate the stock fluctuations (for 52 week values) as a percentage
snp_data <- mutate(snp_data, "fluct_perc" = abs(100*((X52.Week.High - X52.Week.Low)/X52.Week.Low)) )
#Calculate the highest PE sectors
pe_data <- select(snp_data, Sector, Price.Earnings) %>% group_by(Sector) %>% summarise(meanPE = mean(Price.Earnings, na.rm = T))
#Calculate the highest dividend yield sectors
yield_data <- select(snp_data, Sector, Dividend.Yield) %>% group_by(Sector) %>% summarise(meanDividendYield = mean(Dividend.Yield, na.rm = T))
#Calculate the premium stocks in each sector
premium_stocks <- select(snp_data, Sector, Name, Price.Earnings) %>% group_by(Sector) %>% filter(Price.Earnings==max(Price.Earnings, na.rm = T))
#Number of companies by sector in snp 500
ggplot(snp_sector_data, aes(x=values)) + geom_bar() + theme(axis.text.x=element_text(angle=90,hjust=1,vjust=0.5))
#Sectors which experienced most fluctuations
ggplot(arrange(snp_data, desc(fluct_perc)) %>% top_n(100), aes(x=Sector, y=fluct_perc)) + geom_col() + theme(axis.text.x=element_text(angle=90,hjust=1,vjust=0.5)) + geom_text(aes(label=round(fluct_perc,2)), vjust=1.6, color="white", size=3.5)
## Selecting by fluct_perc
#Highest PE sectors
ggplot(pe_data, aes(x=Sector, y=meanPE)) + geom_col() + theme(axis.text.x=element_text(angle=90,hjust=1,vjust=0.5)) + geom_text(aes(label=round(meanPE,2)), vjust=1.6, color="white", size=3.5)
#Highest dividend yield sectors
ggplot(yield_data, aes(x=Sector, y=meanDividendYield)) + geom_col() + theme(axis.text.x=element_text(angle=90,hjust=1,vjust=0.5))+ geom_text(aes(label=round(meanDividendYield,2)), vjust=1.6, color="white", size=3.5)
#Print the premium stocks in each sector
ggplot(premium_stocks, aes(x=Name, y=Price.Earnings, fill=Sector)) + geom_col() + theme(axis.text.x=element_text(angle=90,hjust=1,vjust=0.5)) + geom_text(aes(label=round(Price.Earnings,2)), vjust=1.6, color="white", size=3.5)
1. Maximum number of companies in the SnP 500 are from 'consumer discretionary' sector
2. 'Consumer discretionary' sector is the one which experienced the most fluctuations in its stock prices
3. 'Telecommunication services' sector is the sector which experienced the least fluctuations in stock prices
4. 'Information Technology' and 'Energy' sector companies have the highest PE(price/Earning) ratio amongst all sectors
5. 'Telecommunication Services' sector offers the highest divident yield amongst all the sectors and 'Healthcare' offers the least
This data set contains reviews collected from Glassdoor. It has reviews belonging to former and current employees of top 5 tech companies i.e, Google, Facebook, Microsoft, Apple, Amazon and Netflix. The data set provides the ratings submitted on different counts such as work balance, compensation etc.
empFeedback_data<- read.csv("https://raw.githubusercontent.com/san123i/CUNY/master/Semester1/607/Week5/project2/employee_reviews_modified.csv", header = T, sep = ",", stringsAsFactors = F, strip.white = T)
head(empFeedback_data)
## X company location dates
## 1 1 google none Dec 11, 2018
## 2 2 google Mountain View, CA Jun 21, 2013
## 3 3 google New York, NY May 10, 2014
## 4 4 google Mountain View, CA Feb 8, 2015
## 5 5 google Los Angeles, CA Jul 19, 2018
## 6 6 google Mountain View, CA Dec 9, 2018
## job.title overall.ratings
## 1 Current Employee - Anonymous Employee 5
## 2 Former Employee - Program Manager 4
## 3 Current Employee - Software Engineer III 5
## 4 Current Employee - Anonymous Employee 5
## 5 Former Employee - Software Engineer 5
## 6 Former Employee - SDE2 5
## work.balance.stars culture.values.stars carrer.opportunities.stars
## 1 4 5 5
## 2 2 3 3
## 3 5 4 5
## 4 2 5 5
## 5 5 5 5
## 6 4 4 4
## comp.benefit.stars senior.mangemnet.stars helpful.count
## 1 4 5 0
## 2 5 3 2094
## 3 5 4 949
## 4 4 5 498
## 5 5 5 49
## 6 5 4 1
empFeedback_long <- gather(empFeedback_data, key="Ratings", value="Rated", 6:12)
head(empFeedback_long)
## X company location dates
## 1 1 google none Dec 11, 2018
## 2 2 google Mountain View, CA Jun 21, 2013
## 3 3 google New York, NY May 10, 2014
## 4 4 google Mountain View, CA Feb 8, 2015
## 5 5 google Los Angeles, CA Jul 19, 2018
## 6 6 google Mountain View, CA Dec 9, 2018
## job.title Ratings Rated
## 1 Current Employee - Anonymous Employee overall.ratings 5
## 2 Former Employee - Program Manager overall.ratings 4
## 3 Current Employee - Software Engineer III overall.ratings 5
## 4 Current Employee - Anonymous Employee overall.ratings 5
## 5 Former Employee - Software Engineer overall.ratings 5
## 6 Former Employee - SDE2 overall.ratings 5
#Calculate the overall mean
overall.mean <- select(empFeedback_long, company, Ratings, Rated)%>% subset(Ratings == "overall.ratings") %>% subset(!is.na(Rated)) %>% group_by(company) %>% summarise(overall.mean = mean(as.numeric(Rated), na.rm = T))
#Calculate the mean of workbalance
workbalance.mean <- select(empFeedback_long, company, Ratings, Rated)%>% subset(Ratings == 'work.balance.stars') %>% subset(!is.na(Rated)) %>% group_by(company) %>% summarise(workbalance.mean = mean(as.numeric(Rated), na.rm = T))
#Calculate the mean of the cultural values
culturevalues.mean <- select(empFeedback_long, company, Ratings, Rated)%>% subset(Ratings == 'culture.values.stars') %>% subset(!is.na(Rated)) %>% group_by(company) %>% summarise(culturevalues.mean = mean(as.numeric(Rated), na.rm = T))
#Calculate the mean of career opportunities
carreropportunities.mean <- select(empFeedback_long, company, Ratings, Rated)%>% subset(Ratings == 'carrer.opportunities.stars') %>% subset(!is.na(Rated)) %>% group_by(company) %>% summarise(carreropportunities.mean = mean(as.numeric(Rated), na.rm = T))
#Calculate the mean of compenstation benefits review
compbenefit.mean <- select(empFeedback_long, company, Ratings, Rated)%>% subset(Ratings == 'comp.benefit.stars') %>% subset(!is.na(Rated)) %>% group_by(company) %>% summarise(compbenefit.mean = mean(as.numeric(Rated), na.rm = T))
#Calculate the mean of senior management reviews
seniormanagement.mean <- select(empFeedback_long, company, Ratings, Rated)%>% subset(Ratings == 'senior.mangemnet.stars') %>% subset(!is.na(Rated)) %>% group_by(company) %>% summarise(seniormangemnet.mean = mean(as.numeric(Rated), na.rm = T))
#Join all the mean of all different reviews
ratings_table <- inner_join(overall.mean, workbalance.mean, "company") %>% inner_join(culturevalues.mean, "company") %>% inner_join(carreropportunities.mean, "company") %>% inner_join(compbenefit.mean, "company") %>% inner_join(seniormanagement.mean, "company")
#Convert the wide ratings table into a long table
ratings_long <- gather(ratings_table, key="rating.type",value="rating.mean", 2:7)
ggplot(ratings_long, aes(x=company, y=rating.mean, fill=rating.type)) + geom_bar(stat="identity", position = position_dodge()) + geom_text(aes(label=round(rating.mean,2)), vjust=1.6, color="white", size=3.5)
ggplot(ratings_table, aes(x=company, y=overall.mean)) + geom_col(position = position_dodge()) + geom_text(aes(label=round(overall.mean,2)), vjust=1.6, color="white", size=3.5)
ggplot(ratings_table, aes(x=company, y=workbalance.mean)) + geom_col(position = position_dodge()) + geom_text(aes(label=round(workbalance.mean,2)), vjust=1.6, color="white", size=3.5)
This data set contains about 10K different apps related info collected from Google play store. It provides info about the apps size, category, updated date, and the reviews it received from its users.
play <- read.csv("https://raw.githubusercontent.com/san123i/CUNY/master/Semester1/607/Week5/project2/googleplaystore.csv")
head(play)
## App Category
## 1 Photo Editor & Candy Camera & Grid & ScrapBook ART_AND_DESIGN
## 2 Coloring book moana ART_AND_DESIGN
## 3 U Launcher Lite â\200“ FREE Live Cool Themes, Hide Apps ART_AND_DESIGN
## 4 Sketch - Draw & Paint ART_AND_DESIGN
## 5 Pixel Draw - Number Art Coloring Book ART_AND_DESIGN
## 6 Paper flowers instructions ART_AND_DESIGN
## Rating Reviews Size Installs Type Price Content.Rating
## 1 4.1 159 19M 10,000+ Free 0 Everyone
## 2 3.9 967 14M 500,000+ Free 0 Everyone
## 3 4.7 87510 8.7M 5,000,000+ Free 0 Everyone
## 4 4.5 215644 25M 50,000,000+ Free 0 Teen
## 5 4.3 967 2.8M 100,000+ Free 0 Everyone
## 6 4.4 167 5.6M 50,000+ Free 0 Everyone
## Genres Last.Updated Current.Ver
## 1 Art & Design January 7, 2018 1.0.0
## 2 Art & Design;Pretend Play January 15, 2018 2.0.0
## 3 Art & Design August 1, 2018 1.2.4
## 4 Art & Design June 8, 2018 Varies with device
## 5 Art & Design;Creativity June 20, 2018 1.1
## 6 Art & Design March 26, 2017 1.0
## Android.Ver
## 1 4.0.3 and up
## 2 4.0.3 and up
## 3 4.0.3 and up
## 4 4.2 and up
## 5 4.4 and up
## 6 2.3 and up
cleansedData <- play %>% filter(!is.na(Rating), !Rating<0, !Rating>5) %>% unique()
#Filtering removed almost 500 records which are duplicate or dirty
ggplot(cleansedData, aes(x=Rating, y=as.numeric(Installs))) + geom_bar(stat = "identity") + labs(y = "Install count") + geom_text(aes(label=round(as.numeric(Installs),2)), vjust=1.6, color="white", size=3.5)
ggplot(cleansedData, aes(x=Rating, y=as.numeric(Reviews))) + geom_bar(stat = "identity") + labs(y = "Review count") + geom_text(aes(label=round(as.numeric(Reviews),2)), vjust=1.6, color="white", size=3.5)
ggplot(cleansedData, aes(x=Rating, y=as.numeric(Reviews))) + geom_bar(stat = "identity") + facet_wrap( ~ Category, ncol = 4) + geom_text(aes(label=round(as.numeric(Reviews),2)), vjust=1.6, color="white", size=3.5)