# Load and inspect the movies dataset ----
# NOTE(review): the path below is absolute and machine-specific; point it at
# the local copy of the CSV before running this script on another machine.
movies <- read.csv("C:/Users/palla/Downloads/movies - movies.csv")
# Open the spreadsheet-style viewer only in interactive sessions: View() needs
# a GUI and serves no purpose when the script is run in batch mode or rendered.
if (interactive()) {
  View(movies)
}
# Show column names
names(movies)
## [1] "Rank"         "Movie"        "Release_Date" "Distributor"  "Genre"       
## [6] "MPAA"         "Gross_Sales"  "Tickets_Sold"
# Show the number of rows and columns
dim(movies)
## [1] 50  8
# Display the first rows of the dataset
head(movies)
##   Rank                  Movie Release_Date  Distributor     Genre  MPAA
## 1    1         The Lego Movie       2/7/14 Warner Bros. Adventure    PG
## 2    2             Ride Along      1/17/14    Universal    Comedy PG-13
## 3    3          Lone Survivor      1/10/14    Universal    Action     R
## 4    4                 Frozen     11/27/13  Walt Disney Adventure    PG
## 5    5 300: Rise of an Empire       3/7/14 Warner Bros.    Action     R
## 6    6              Divergent      3/21/14    Lionsgate Adventure PG-13
##   Gross_Sales Tickets_Sold
## 1   248303720     30429377
## 2   133659265     16379811
## 3   124722648     15284638
## 4   121285671     14863440
## 5   101145414     12395271
## 6    95260008     11674020
# Show the structure (column types and sample values) of the dataset
str(movies)
## 'data.frame':    50 obs. of  8 variables:
##  $ Rank        : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Movie       : chr  "The Lego Movie" "Ride Along" "Lone Survivor" "Frozen" ...
##  $ Release_Date: chr  "2/7/14" "1/17/14" "1/10/14" "11/27/13" ...
##  $ Distributor : chr  "Warner Bros." "Universal" "Universal" "Walt Disney" ...
##  $ Genre       : chr  "Adventure" "Comedy" "Action" "Adventure" ...
##  $ MPAA        : chr  "PG" "PG-13" "R" "PG" ...
##  $ Gross_Sales : int  248303720 133659265 124722648 121285671 101145414 95260008 94479448 85091060 76599461 74500902 ...
##  $ Tickets_Sold: int  30429377 16379811 15284638 14863440 12395271 11674020 11578363 10427825 9387188 9130012 ...
# Show a per-column statistical summary
summary(movies)
##       Rank          Movie           Release_Date       Distributor       
##  Min.   : 1.00   Length:50          Length:50          Length:50         
##  1st Qu.:13.25   Class :character   Class :character   Class :character  
##  Median :25.50   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :25.50                                                           
##  3rd Qu.:37.75                                                           
##  Max.   :50.00                                                           
##     Genre               MPAA            Gross_Sales         Tickets_Sold     
##  Length:50          Length:50          Min.   :  5330000   Min.   :  653186  
##  Class :character   Class :character   1st Qu.: 18093441   1st Qu.: 2217333  
##  Mode  :character   Mode  :character   Median : 27647462   Median : 3388169  
##                                        Mean   : 44609877   Mean   : 5466896  
##                                        3rd Qu.: 57711063   3rd Qu.: 7072434  
##                                        Max.   :248303720   Max.   :30429377
# Relationship between tickets sold and gross sales ----
# First look at the raw relationship
plot(movies$Tickets_Sold, movies$Gross_Sales)

# Keep the two columns available as standalone vectors
h <- movies$Tickets_Sold
d <- movies$Gross_Sales
# Fit gross sales as a linear function of tickets sold. The formula refers to
# the data-frame columns, so `data = movies` is what actually supplies the
# variables and the fitted coefficient is named after its column.
model <- lm(Gross_Sales ~ Tickets_Sold, data = movies)

# Scatter plot with the fitted regression line drawn on top
plot(movies$Tickets_Sold, movies$Gross_Sales)
abline(model)

# Same scatter plot with both axes in thousands
plot(movies$Tickets_Sold / 1000, movies$Gross_Sales / 1000)

# Same scatter plot with both axes in hundreds of thousands
plot(movies$Tickets_Sold / 100000, movies$Gross_Sales / 100000)

# Same scatter plot with both axes in millions
plot(movies$Tickets_Sold / 1000000, movies$Gross_Sales / 1000000)

# Pearson correlation between tickets sold and gross sales. The recorded
# result below prints as 1, i.e. the points lie (essentially) on a straight
# line in this dataset.
cor(movies$Tickets_Sold, movies$Gross_Sales)
## [1] 1
# Plots on a millions scale ----
# Add columns rescaled to millions so the axes are readable
movies$Tickets_SoldMillions <- movies$Tickets_Sold / 1000000
movies$Gross_SalesMillions <- movies$Gross_Sales / 1000000
# Standalone copy of gross sales in millions (same values as the new column)
m <- movies$Gross_SalesMillions

# Labelled scatter plot, teal points
plot(
  movies$Tickets_SoldMillions,
  movies$Gross_SalesMillions,
  main = "Tickets Sold vs Gross Sales",
  xlab = "Tickets Sold (Millions)",
  ylab = "Gross Sales (Millions USD)",
  frame.plot = TRUE,
  col = "#4cbea3"
)

# Same labelled scatter plot, blue points
plot(
  movies$Tickets_SoldMillions,
  movies$Gross_SalesMillions,
  main = "Tickets Sold vs Gross Sales",
  xlab = "Tickets Sold (Millions)",
  ylab = "Gross Sales (Millions USD)",
  frame.plot = TRUE,
  col = "blue"
)

# Vertical boxplot of gross sales (millions)
boxplot(
  m,
  horizontal = FALSE,
  main = "Top 50 Grossing Films",
  frame.plot = TRUE,
  col = "red"
)

# Horizontal boxplot of the same values
boxplot(
  movies$Gross_SalesMillions,
  horizontal = TRUE,
  main = "Top 50 Grossing Films",
  frame.plot = TRUE,
  col = "green"
)

# Convert genre to a factor
movies$genrefactor <- as.factor(movies$Genre)
# Bar chart of film counts per genre. Genre is categorical, so the counts per
# level are drawn with barplot(table(...)); a histogram of the integer factor
# codes can merge neighbouring codes into one bin and hides the genre names.
barplot(
  table(movies$genrefactor),
  main = "Frequency of Films by Genre",
  xlab = "Genre Category",
  ylab = "Frequency",
  col = "#4cbea3"
)

# Histogram of gross sales; the values are in millions, so the axis says so
hist(
  movies$Gross_SalesMillions,
  main = "Frequency of Movies by Gross Sales",
  xlab = "Gross Sales (Millions USD)",
  ylab = "Frequency",
  col = "#4cbea3",
  border = "#FFFFFF"
)


# Load the GDP and life expectancy datasets ----
# NOTE(review): both paths below are absolute and machine-specific; point them
# at the local copies of the CSVs before running this on another machine.
gdp <- read.csv("C:/Users/palla/Downloads/gdpcsv - gdpcsv.csv")
# Open the data viewer only in interactive sessions; View() needs a GUI and
# serves no purpose when the script is run in batch mode or rendered.
if (interactive()) {
  View(gdp)
}
# Show GDP column names (one column per year, prefixed with "X" by read.csv)
names(gdp)
##  [1] "Country.Name" "Country.Code" "X1960"        "X1961"        "X1962"       
##  [6] "X1963"        "X1964"        "X1965"        "X1966"        "X1967"       
## [11] "X1968"        "X1969"        "X1970"        "X1971"        "X1972"       
## [16] "X1973"        "X1974"        "X1975"        "X1976"        "X1977"       
## [21] "X1978"        "X1979"        "X1980"        "X1981"        "X1982"       
## [26] "X1983"        "X1984"        "X1985"        "X1986"        "X1987"       
## [31] "X1988"        "X1989"        "X1990"        "X1991"        "X1992"       
## [36] "X1993"        "X1994"        "X1995"        "X1996"        "X1997"       
## [41] "X1998"        "X1999"        "X2000"        "X2001"        "X2002"       
## [46] "X2003"        "X2004"        "X2005"        "X2006"        "X2007"       
## [51] "X2008"        "X2009"        "X2010"        "X2011"        "X2012"       
## [56] "X2013"        "X2014"        "X2015"        "X2016"        "X2017"
# Show GDP dimensions
dim(gdp)
## [1] 264  60
# Import life expectancy dataset
life_expectancy <- read.csv("C:/Users/palla/Downloads/life_expectancy - life_expectancy.csv")
# View life expectancy data (interactive sessions only, as above)
if (interactive()) {
  View(life_expectancy)
}
# Show life expectancy column names
names(life_expectancy)
##  [1] "Country.Name" "Country.Code" "X1960"        "X1961"        "X1962"       
##  [6] "X1963"        "X1964"        "X1965"        "X1966"        "X1967"       
## [11] "X1968"        "X1969"        "X1970"        "X1971"        "X1972"       
## [16] "X1973"        "X1974"        "X1975"        "X1976"        "X1977"       
## [21] "X1978"        "X1979"        "X1980"        "X1981"        "X1982"       
## [26] "X1983"        "X1984"        "X1985"        "X1986"        "X1987"       
## [31] "X1988"        "X1989"        "X1990"        "X1991"        "X1992"       
## [36] "X1993"        "X1994"        "X1995"        "X1996"        "X1997"       
## [41] "X1998"        "X1999"        "X2000"        "X2001"        "X2002"       
## [46] "X2003"        "X2004"        "X2005"        "X2006"        "X2007"       
## [51] "X2008"        "X2009"        "X2010"        "X2011"        "X2012"       
## [56] "X2013"        "X2014"        "X2015"        "X2016"        "X2017"
# Show life expectancy dimensions
dim(life_expectancy)
## [1] 264  60
# GDP vs life expectancy for 2016 ----
# Extract the 2016 GDP column
gdp2016 <- gdp$X2016
# Inspect the values in the data viewer (interactive sessions only; View()
# needs a GUI)
if (interactive()) {
  View(gdp2016)
}
# Extract the 2016 life expectancy column
le2016 <- life_expectancy$X2016
# The two series come from separate tables, so pair them by country code
# rather than by row position. When both tables list the countries in the
# same order this is identical to `le2016`.
le2016_by_gdp_row <- life_expectancy$X2016[
  match(gdp$Country.Code, life_expectancy$Country.Code)
]
# Scatter plot of GDP against life expectancy. Both variables are plotted
# unscaled so the axis values match their labels.
plot(
  gdp2016,
  le2016_by_gdp_row,
  main = "GDP vs Life Expectancy 2016",
  xlab = "GDP 2016",
  ylab = "Life Expectancy 2016",
  col = "red"
)

# Histogram of GDP values
hist(
  gdp2016,
  main = "GDP Values",
  xlab = "GDP 2016",
  ylab = "Frequency",
  col = "#3FFFFF"
)

# Boxplot of 2016 GDP
boxplot(
  gdp2016,
  horizontal = FALSE,
  main = "GDP 2016",
  frame.plot = TRUE,
  col = "red"
)

# Boxplot of 2016 life expectancy
boxplot(
  le2016,
  horizontal = FALSE,
  main = "Life Expectancy 2016",
  frame.plot = TRUE,
  col = "yellow"
)