# Load the movies dataset from a local CSV export.
# NOTE(review): this absolute path is specific to one machine; adjust it
# before running the script anywhere else.
movies <- read.csv("C:/Users/palla/Downloads/movies - movies.csv")
# Open the spreadsheet-style viewer only when someone is at the console.
# In a batch run (e.g. Rscript) there is nobody to look at it and the call
# may fail because no viewer is available.
if (interactive()) {
  View(movies)
}
# Show column names
names(movies)
## [1] "Rank" "Movie" "Release_Date" "Distributor" "Genre"
## [6] "MPAA" "Gross_Sales" "Tickets_Sold"
# Show rows and columns count
dim(movies)
## [1] 50 8
# Display first rows of dataset
head(movies)
## Rank Movie Release_Date Distributor Genre MPAA
## 1 1 The Lego Movie 2/7/14 Warner Bros. Adventure PG
## 2 2 Ride Along 1/17/14 Universal Comedy PG-13
## 3 3 Lone Survivor 1/10/14 Universal Action R
## 4 4 Frozen 11/27/13 Walt Disney Adventure PG
## 5 5 300: Rise of an Empire 3/7/14 Warner Bros. Action R
## 6 6 Divergent 3/21/14 Lionsgate Adventure PG-13
## Gross_Sales Tickets_Sold
## 1 248303720 30429377
## 2 133659265 16379811
## 3 124722648 15284638
## 4 121285671 14863440
## 5 101145414 12395271
## 6 95260008 11674020
# Compact overview of every column: storage type plus leading values
utils::str(movies)
## 'data.frame': 50 obs. of 8 variables:
## $ Rank : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Movie : chr "The Lego Movie" "Ride Along" "Lone Survivor" "Frozen" ...
## $ Release_Date: chr "2/7/14" "1/17/14" "1/10/14" "11/27/13" ...
## $ Distributor : chr "Warner Bros." "Universal" "Universal" "Walt Disney" ...
## $ Genre : chr "Adventure" "Comedy" "Action" "Adventure" ...
## $ MPAA : chr "PG" "PG-13" "R" "PG" ...
## $ Gross_Sales : int 248303720 133659265 124722648 121285671 101145414 95260008 94479448 85091060 76599461 74500902 ...
## $ Tickets_Sold: int 30429377 16379811 15284638 14863440 12395271 11674020 11578363 10427825 9387188 9130012 ...
# Per-column summary (quartiles for numeric columns, length/class for text)
summary(object = movies)
## Rank Movie Release_Date Distributor
## Min. : 1.00 Length:50 Length:50 Length:50
## 1st Qu.:13.25 Class :character Class :character Class :character
## Median :25.50 Mode :character Mode :character Mode :character
## Mean :25.50
## 3rd Qu.:37.75
## Max. :50.00
## Genre MPAA Gross_Sales Tickets_Sold
## Length:50 Length:50 Min. : 5330000 Min. : 653186
## Class :character Class :character 1st Qu.: 18093441 1st Qu.: 2217333
## Mode :character Mode :character Median : 27647462 Median : 3388169
## Mean : 44609877 Mean : 5466896
## 3rd Qu.: 57711063 3rd Qu.: 7072434
## Max. :248303720 Max. :30429377
# First look at the relationship: tickets sold (x) against gross sales (y)
plot(x = movies$Tickets_Sold, y = movies$Gross_Sales)

# Short aliases for the two columns of interest. They are no longer needed by
# the model below but are kept in case later code still refers to them.
h <- movies$Tickets_Sold
d <- movies$Gross_Sales
# Fit gross sales as a linear function of tickets sold.
# The formula names the data-frame columns directly so that `data = movies`
# is actually used, the slope coefficient is labelled `Tickets_Sold` rather
# than `h`, and predict() can be given `newdata` with a Tickets_Sold column.
model <- lm(Gross_Sales ~ Tickets_Sold, data = movies)

# Draw the scatter plot once, then overlay the fitted regression line.
# abline() adds to the current plot, so the scatter has to come first.
plot(movies$Tickets_Sold, movies$Gross_Sales)
abline(model)

# Same scatter with both axes divided by a common factor: thousands
plot(x = movies$Tickets_Sold / 1e3, y = movies$Gross_Sales / 1e3)

# ... hundreds of thousands
plot(x = movies$Tickets_Sold / 1e5, y = movies$Gross_Sales / 1e5)

# ... millions
plot(x = movies$Tickets_Sold / 1e6, y = movies$Gross_Sales / 1e6)

# Pearson correlation between tickets sold and gross sales
with(movies, cor(Tickets_Sold, Gross_Sales))
## [1] 1
# Add derived columns expressing both measures in millions
movies[["Tickets_SoldMillions"]] <- movies[["Tickets_Sold"]] / 1e6
movies[["Gross_SalesMillions"]] <- movies[["Gross_Sales"]] / 1e6
# Stand-alone copy of gross sales in millions
m <- movies[["Gross_Sales"]] / 1e6
# Titled, labelled scatter of the millions columns, teal points
plot(
  x = movies$Tickets_SoldMillions,
  y = movies$Gross_SalesMillions,
  main = "Tickets Sold vs Gross Sales",
  xlab = "Tickets Sold (Millions)",
  ylab = "Gross Sales (Millions USD)",
  frame.plot = TRUE,
  col = "#4cbea3"
)

# Identical plot, blue points
plot(
  x = movies$Tickets_SoldMillions,
  y = movies$Gross_SalesMillions,
  main = "Tickets Sold vs Gross Sales",
  xlab = "Tickets Sold (Millions)",
  ylab = "Gross Sales (Millions USD)",
  frame.plot = TRUE,
  col = "blue"
)

# Distribution of gross sales (millions) as an upright box, red fill
boxplot(
  m,
  horizontal = FALSE,
  main = "Top 50 Grossing Films",
  frame.plot = TRUE,
  col = "red"
)

# Same distribution laid out sideways, green fill
boxplot(
  movies$Gross_SalesMillions,
  horizontal = TRUE,
  main = "Top 50 Grossing Films",
  frame.plot = TRUE,
  col = "green"
)

# Store genre as a factor so its distinct categories can be tabulated
movies$genrefactor <- as.factor(movies$Genre)
# Bar chart of film counts per genre.
# Genre is categorical, so the counts come from table() and are drawn with
# barplot(): one bar per genre, labelled with the genre name. A histogram of
# the factor's integer codes treats them as a continuous variable, can merge
# neighbouring codes into a single bar, and shows code numbers, not names.
barplot(
  table(movies$genrefactor),
  main = "Frequency of Films by Genre",
  xlab = "Genre Category",
  ylab = "Frequency",
  col = "#4cbea3"
)

# Histogram of gross sales. The plotted column is already divided by one
# million, so the axis label states that unit, matching the scatter plots.
hist(
  movies$Gross_SalesMillions,
  main = "Frequency of Movies by Gross Sales",
  xlab = "Gross Sales (Millions USD)",
  ylab = "Frequency",
  col = "#4cbea3",
  border = "#FFFFFF"
)

# Load the GDP dataset from a local CSV export.
# NOTE(review): this absolute path is specific to one machine; adjust it
# before running the script anywhere else.
gdp <- read.csv("C:/Users/palla/Downloads/gdpcsv - gdpcsv.csv")
# Open the data viewer only in an interactive session; a batch run has no
# use for it and the call may fail there.
if (interactive()) {
  View(gdp)
}
# Show GDP column names
names(gdp)
## [1] "Country.Name" "Country.Code" "X1960" "X1961" "X1962"
## [6] "X1963" "X1964" "X1965" "X1966" "X1967"
## [11] "X1968" "X1969" "X1970" "X1971" "X1972"
## [16] "X1973" "X1974" "X1975" "X1976" "X1977"
## [21] "X1978" "X1979" "X1980" "X1981" "X1982"
## [26] "X1983" "X1984" "X1985" "X1986" "X1987"
## [31] "X1988" "X1989" "X1990" "X1991" "X1992"
## [36] "X1993" "X1994" "X1995" "X1996" "X1997"
## [41] "X1998" "X1999" "X2000" "X2001" "X2002"
## [46] "X2003" "X2004" "X2005" "X2006" "X2007"
## [51] "X2008" "X2009" "X2010" "X2011" "X2012"
## [56] "X2013" "X2014" "X2015" "X2016" "X2017"
# Show GDP dimensions
dim(gdp)
## [1] 264 60
# Load the life expectancy dataset from a local CSV export.
# NOTE(review): this absolute path is specific to one machine; adjust it
# before running the script anywhere else.
life_expectancy <- read.csv("C:/Users/palla/Downloads/life_expectancy - life_expectancy.csv")
# Open the data viewer only in an interactive session; a batch run has no
# use for it and the call may fail there.
if (interactive()) {
  View(life_expectancy)
}
# Show column names
names(life_expectancy)
## [1] "Country.Name" "Country.Code" "X1960" "X1961" "X1962"
## [6] "X1963" "X1964" "X1965" "X1966" "X1967"
## [11] "X1968" "X1969" "X1970" "X1971" "X1972"
## [16] "X1973" "X1974" "X1975" "X1976" "X1977"
## [21] "X1978" "X1979" "X1980" "X1981" "X1982"
## [26] "X1983" "X1984" "X1985" "X1986" "X1987"
## [31] "X1988" "X1989" "X1990" "X1991" "X1992"
## [36] "X1993" "X1994" "X1995" "X1996" "X1997"
## [41] "X1998" "X1999" "X2000" "X2001" "X2002"
## [46] "X2003" "X2004" "X2005" "X2006" "X2007"
## [51] "X2008" "X2009" "X2010" "X2011" "X2012"
## [56] "X2013" "X2014" "X2015" "X2016" "X2017"
# Show dataset size
dim(life_expectancy)
## [1] 264 60
# Pull the 2016 column out of the GDP data as a plain vector
gdp2016 <- gdp$X2016
# Inspect the extracted values in the viewer, but only when running
# interactively; a batch run has no use for it and the call may fail there.
if (interactive()) {
  View(gdp2016)
}
# Pull the 2016 column out of the life expectancy data
le2016 <- life_expectancy$X2016
# Scatter plot of 2016 GDP against 2016 life expectancy.
# Life expectancy is plotted as-is: dividing it by 1000 only distorted the
# axis values. GDP stays divided by 1000 to keep tick labels short, and the
# x-axis label now says so instead of implying raw values.
plot(
  gdp2016 / 1000,
  le2016,
  main = "GDP vs Life Expectancy 2016",
  xlab = "GDP 2016 (Thousands)",
  ylab = "Life Expectancy 2016",
  col = "red"
)

# Frequency distribution of the 2016 GDP values
hist(
  x = gdp2016,
  main = "GDP Values",
  xlab = "GDP 2016",
  ylab = "Frequency",
  col = "#3FFFFF"
)

# Upright boxplot of 2016 GDP, red fill
boxplot(
  gdp2016,
  horizontal = FALSE,
  main = "GDP 2016",
  frame.plot = TRUE,
  col = "red"
)

# Boxplots of GDP and life expectancy, one figure each.
# The GDP figure repeats the one drawn just above.
boxplot(
  gdp2016,
  horizontal = FALSE,
  main = "GDP 2016",
  frame.plot = TRUE,
  col = "RED"
)

boxplot(
  le2016,
  horizontal = FALSE,
  main = "Life Expectancy 2016",
  frame.plot = TRUE,
  col = "yellow"
)
