Packages used.

install.packages("sqldf",repos = "http://cran.us.r-project.org")
## 
## The downloaded binary packages are in
##  /var/folders/4z/9kr7k7jd1q7gc792xjkrwf040000gn/T//Rtmp77qfg8/downloaded_packages
library(sqldf)  #sql package for doing sql queris
## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite
## Loading required package: DBI
install.packages("dplyr",repos = "http://cran.us.r-project.org")
## 
## The downloaded binary packages are in
##  /var/folders/4z/9kr7k7jd1q7gc792xjkrwf040000gn/T//Rtmp77qfg8/downloaded_packages
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Sets the directory where the files are located.

setwd("/Users/janakiramsundaraneedi/Desktop/sqlmidterm")
IMDB<-read.csv("final2.csv")
names(IMDB)
## [1] "X"          "length"     "name"       "title"      "year"      
## [6] "genre"      "studioname" "zipcode"
dim(IMDB)
## [1] 149   8

Creating differet data set from a single file. And we have movies, stars and starsIn dataset for quering,

movies<-select(IMDB, title, year, length, genre, studioname)
dim(movies)
## [1] 149   5
names(movies)
## [1] "title"      "year"       "length"     "genre"      "studioname"
head(movies)
##                                             title year length     genre
## 1                                    Spectre\xeb_ 2015    148 Adventure
## 2 Star Wars: Episode VII - The Force Awakens\xeb_   NA     NA Adventure
## 3                          Jupiter Ascending\xeb_ 2015    127 Adventure
## 4                      Six Days Seven Nights\xeb_ 1998     98     Crime
## 5                           Mighty Joe Young\xeb_ 1998    114     Crime
## 6                   The Manchurian Candidate\xeb_ 2015    129     Crime
##          studioname
## 1    Universum Film
## 2    Universum Film
## 3        True north
## 4 Sudha Productions
## 5 Sudha Productions
## 6 Sudha Productions
stars<-select(IMDB, name, zipcode)
dim(stars)
## [1] 149   2
names(stars)
## [1] "name"    "zipcode"
head(stars)
##              name zipcode
## 1 Christoph Waltz      13
## 2     Doug Walker      14
## 3  Channing Tatum      14
## 4   Harrison Ford       8
## 5 Charlize Theron       8
## 6      Tom Cruise       5
starsIn<-select(IMDB, title, year, name)
dim(starsIn)
## [1] 149   3
names(starsIn)
## [1] "title" "year"  "name"
head(starsIn)
##                                             title year            name
## 1                                    Spectre\xeb_ 2015 Christoph Waltz
## 2 Star Wars: Episode VII - The Force Awakens\xeb_   NA     Doug Walker
## 3                          Jupiter Ascending\xeb_ 2015  Channing Tatum
## 4                      Six Days Seven Nights\xeb_ 1998   Harrison Ford
## 5                           Mighty Joe Young\xeb_ 1998 Charlize Theron
## 6                   The Manchurian Candidate\xeb_ 2015      Tom Cruise

Midterm questions

1)Find the title of movies Harrison Ford starred in. solution:

title_movie_ford<-sqldf("select title from starsIn where name='Harrison Ford'")
## Loading required package: tcltk
title_movie_ford
##                        title
## 1 Six Days Seven Nights\xeb_
## 2          Ender's Game\xeb_

2)Find the name and zipcode of stars who have starred in a movie made by Sudha production

su_prod<-sqldf("SELECT name, zipcode from (select name,title,year From movies join starsIn USING(title, year) where studioname='Sudha Productions') join stars USING(name)")
su_prod
##               name zipcode
## 1    Harrison Ford       8
## 2    Harrison Ford      12
## 3  Charlize Theron       8
## 4       Tom Cruise       2
## 5       Tom Cruise       5
## 6       Tom Cruise      11
## 7       Tom Cruise      11
## 8       Tom Cruise      13
## 9       Tom Cruise      13
## 10      Tom Cruise      14
## 11      Tom Cruise      14
## 12      Tom Cruise      17
## 13      Tom Cruise      19
## 14      Tom Cruise      20
## 15      Tom Cruise      20
## 16  Robin Williams       7
## 17     Gary Oldman       9
## 18     Gary Oldman      13
## 19       Brad Pitt       5
## 20       Brad Pitt       5
## 21       Brad Pitt      19

3)Find the name of stars who have starred in both Disney movies and MGM movies.

s_pic<-sqldf("SELECT DISTINCT name
               FROM movies 
               JOIN starsIn USING(title, year) where studioname='S Pictures'")
sudha_prod<-sqldf("SELECT DISTINCT name
               FROM movies 
               JOIN starsIn USING(title, year) where studioname='Sudha Productions'")
name_stars_both_studionames<-intersect(s_pic, sudha_prod)
print(name_stars_both_studionames)
##         name
## 1 Tom Cruise

6)list the number of movies each star has starred in. Display the result in ascending order of star’s names.

name_adress<-sqldf("select name, count(title) AS count
from starsIn
group by name
order by name asc")
head(name_adress)
##            name count
## 1  A.J. Buckley     1
## 2 Abbie Cornish     1
## 3  Adam Sandler     1
## 4  Aidan Turner     1
## 5     Al Pacino     1
## 6   Alain Delon     1
tail(name_adress)
##                   name count
## 99           Tom Hanks     2
## 100        Tony Curran     1
## 101 Vincent Schiavelli     1
## 102      Vivica A. Fox     1
## 103         Will Smith     1
## 104      William Smith     1

8)Find the name of studio and the average length of movies it made in 2015 if the studio has made more than 1 movies in 2015.

studio_name<-sqldf("select studioname, avg(length) from movies where year=2015 group by studioName having count(*)>1")
head(studio_name)
##          studioname avg(length)
## 1    Bombay Talkies       137.5
## 2      Eskay Movies       125.5
## 3     Pannonia Film       152.0
## 4 Sudha Productions       129.5
tail(studio_name)
##          studioname avg(length)
## 1    Bombay Talkies       137.5
## 2      Eskay Movies       125.5
## 3     Pannonia Film       152.0
## 4 Sudha Productions       129.5