Assignment 6

With this assignment, we’ll take a look at some of what the numpy module can do. Do both of the following:

Using your submission of homework 1 as a base, replace as many of the functions as you can with numpy functions. For example, instead of using your sort function that you wrote, use numpy.sort. Refer to here for most of the functions you’ll need.
Using the timeit function, measure the execution times of all the sort and search functions you have. You’ll most likely need to do a large number of tests on each one to get a meaningful result, something like 10000 or more.

Your submission will be a single file that has all the functions from homework 1 and the additional approach using numpy. Additionally, you will have the timing of all the functions output to the console.

import timeit

# Setup source handed to timeit.Timer(setup=...). It is executed once per Timer
# and defines the functions under test plus the sample data L (list) and
# A (numpy array) that the timed statements refer to.
code = """
import numpy, copy

def sortwithloops(input):
    # Bubble sort. Sorts the caller's list IN PLACE and returns that same
    # list, so callers that need the original order must pass a copy.
    output = input
    for n in range(len(output) - 1, 0, -1):
        for i in range(n):
            if output[i] > output[i + 1]:
                output[i], output[i + 1] = output[i + 1], output[i]
    return output

def sortwithoutloops(input):
    # Built-in list sort. Also sorts IN PLACE and returns the same list.
    output = input
    output.sort()
    return output

def sortwithnumpy(input):
    # numpy.sort returns a sorted copy; the argument is left untouched.
    output = input
    return numpy.sort(output)

def searchwithloops(input, value):
    # Linear search over every element, starting at index 0.
    # Returns False for an empty sequence instead of failing on an
    # unassigned result variable.
    for element in input:
        if value == element:
            return True
    return False

def searchwithoutloops(input, value):
    # Membership test using the built-in `in` operator.
    output = value in input
    return output

def searchwithnumpy(input, value):
    # Element-wise comparison reduced with numpy.any (returns a numpy bool).
    output = input
    return numpy.any(value == output)

L = [5,3,6,3,13,5,6]
A = numpy.array(L)
"""

n = 10000
print "SORTING"
t = timeit.Timer("x = copy.copy(L); sortwithloops(x)", setup = code)
print "Sort using iteration        :", n, "loops =", t.timeit(n), "seconds"
t = timeit.Timer("x = copy.copy(L); sortwithoutloops(x)", setup = code)
print "Sort using built in python  :", n, "loops =", t.timeit(n), "seconds"
t = timeit.Timer("x = copy.copy(A); sortwithnumpy(x)", setup = code)
print "Sort using numpy            :", n, "loops =", t.timeit(n), "seconds"

print "\nSEARCHING"
t = timeit.Timer("x = copy.copy(L); searchwithloops(x, 5)", setup = code)
print "Search using iteration      :", n, "loops =", t.timeit(n), "seconds"
t = timeit.Timer("x = copy.copy(L); searchwithoutloops(x, 5)", setup = code)
print "Search using built in python:", n, "loops =", t.timeit(n), "seconds"
t = timeit.Timer("x = copy.copy(A); searchwithnumpy(x, 5)", setup = code)
print "Search using numpy          :", n, "loops =", t.timeit(n), "seconds"

## SORTING
## Sort using iteration        : 10000 loops = 0.159836472178 seconds
## Sort using built in python  : 10000 loops = 0.0284274986743 seconds
## Sort using numpy            : 10000 loops = 0.0735014226008 seconds
## 
## SEARCHING
## Search using iteration      : 10000 loops = 0.0384250468974 seconds
## Search using built in python: 10000 loops = 0.0273891996374 seconds
## Search using numpy          : 10000 loops = 0.154666643859 seconds

Assignment 7

Take what you did on homework 5 as a starting point (using any of the provided datasets). Replace the regression calculation using least squares with a curve fitting approach (examples in the reading). To start, just fit a linear equation. Output the equation to the console. You don’t need to graph anything (we’ll look at that in a couple more weeks).

Again, using timeit, compare the performance of your solution in homework 5 to the scipy function. Output the results to the console.

import timeit, Tkinter, tkFileDialog, csv, pandas, numpy
from scipy.optimize import curve_fit

# IMPORT CSV FILE
# Create the Tk root window that the file dialog is parented to.
root = Tkinter.Tk()
# Hide the root window so that only the dialog is shown.
root.withdraw()
# Ask the user to choose a file. The chosen path is kept in filePath, which
# the timing code further down passes to each regression function.
filePath = tkFileDialog.askopenfilename(parent = root)

def manual_regression(filePath):
    """Fit a straight line ``bo = beta_1 * br + beta_0`` by least squares.

    Uses only the csv module and built-in arithmetic.

    Args:
        filePath: Path to a CSV file with one header row. In each data row,
            column index 1 is read as the Y value (bo) and column index 2 as
            the X value (br); both must parse as floats.

    Returns:
        A 5-tuple ``("Model1: bo = ", slope, "* br ", sign, abs(intercept))``
        where ``sign`` is "+" for a positive intercept and "-" otherwise.

    Raises:
        ZeroDivisionError: If the file has no data rows, or all X values are
            identical.
    """
    br = []  # X-variable
    bo = []  # Y-variable
    # The with-block closes the file even if a row fails to parse.
    with open(filePath) as infile:
        brainandbody = csv.reader(infile)
        next(brainandbody)  # Skips first row of CSV file
        for col in brainandbody:
            bo.append(float(col[1]))
            br.append(float(col[2]))
    n = len(br)
    mu_br = sum(br) / n
    mu_bo = sum(bo) / n
    sum_XX = sum([i ** 2 for i in br])
    sum_XY = sum([a * b for a, b in zip(br, bo)])
    # Corrected sums of squares / cross-products about the means.
    ss_x = sum_XX - n * mu_br ** 2
    ss_xy = sum_XY - n * mu_br * mu_bo
    beta_1 = ss_xy / ss_x
    beta_0 = mu_bo - beta_1 * mu_br
    return "Model1: bo = ", beta_1, "* br ", "+" if beta_0 > 0 else "-", abs(beta_0)

def scipy_linregress(filePath):
    """Fit ``bo = slope * br + intercept`` with scipy.stats.linregress.

    Args:
        filePath: Path to a CSV file whose header names include "brain"
            (used as X) and "body" (used as Y).

    Returns:
        A 5-tuple ``("Model2: bo = ", slope, "* br ", sign, abs(intercept))``
        where ``sign`` is "+" for a positive intercept and "-" otherwise.
    """
    from scipy import stats
    # The with-block closes the file even if parsing raises.
    with open(filePath) as infile:
        brainandbody = pandas.read_csv(infile)
    br = brainandbody["brain"]  # X-data
    bo = brainandbody["body"]  # Y-data
    # Only the first two of the five returned values are needed here.
    slope, intercept, _r_value, _p_value, _std_err = stats.linregress(br, bo)
    return "Model2: bo = ", slope, "* br ", "+" if intercept > 0 else "-", abs(intercept)

def scipy_curve_fit(filePath):
    """Fit ``bo = a * br + b`` with scipy.optimize.curve_fit.

    Args:
        filePath: Path to a CSV file whose header names include "brain"
            (used as X) and "body" (used as Y).

    Returns:
        A 5-tuple ``("Model3: bo = ", a, "* br ", sign, abs(b))`` where
        ``sign`` is "+" for a positive ``b`` and "-" otherwise.
    """
    # The with-block closes the file even if parsing raises.
    with open(filePath) as infile:
        brainandbody = pandas.read_csv(infile)
    br = brainandbody["brain"]  # X-data
    bo = brainandbody["body"]  # Y-data

    def line(x, a, b):
        # Model function handed to curve_fit: a straight line.
        return a * x + b

    # The covariance estimate is returned by curve_fit but not used here.
    fitParams, fitCovariances = curve_fit(line, br, bo)
    return "Model3: bo = ", fitParams[0], "* br ", "+" if fitParams[1] > 0 else "-", abs(fitParams[1])

n = 10**4
print "REGRESSION"
t = timeit.Timer(lambda: manual_regression(filePath))
print "Derived with built in functions:", n, "loops =", t.timeit(n), "seconds"
t = timeit.Timer(lambda: scipy_linregress(filePath))
print "Derived using SciPy linregress :", n, "loops =", t.timeit(n), "seconds"
t = timeit.Timer(lambda: scipy_curve_fit(filePath))
print "Derived using SciPy curve_fit  :", n, "loops =", t.timeit(n), "seconds"

## REGRESSION
## Derived with built in functions: 10000 loops = 4.01855082876 seconds
## Derived using SciPy linregress : 10000 loops = 43.4578431205 seconds
## Derived using SciPy curve_fit  : 10000 loops = 99.7825319444 seconds

Assignment 8

This homework will give you a chance to explore some image processing techniques in Python. These are some of the most basic tasks done in image processing. First, download the image package attached to this lesson. On each image you will count the number of objects in the image and find their center points. The images in order of complexity are circles.png, objects.png, and peppers.png. Using Python’s builtin functionality, scipy, or any other module, perform the following tasks:

Circles	Objects	Peppers

Thresholding: First convert the image to a binary image. This is done with a technique called thresholding, which is covered in the reading. There are functions for it in scipy, although it is very easy to do manually. Essentially, read each pixel and, if it is above a specified gray level, make it white; otherwise make it black.
Count objects: Count the number of objects in the image. If you are interested in how this is done, refer to the additional readings. An object will be a group of white pixels surrounded by black pixels. Doing this by hand is also fairly easy, but try to use functions found in the modules available.
Find center points: For each object, find the center point in terms of \(x,y\) coordinates. As with part 3, you can do this directly, but it’s better to use something from a module.

Image files can be read in directly or you can use a dialog box. Your output will list the objects and midpoints for each image. Remember, the focus here is to use readily available Python functions to do image processing rather than gain a deep understanding of the theory of the techniques. For the peppers.png image, I’m not expecting a specific number, as the answer is fairly subjective. The focus is on the process, not the result.

import scipy.ndimage as ndimage
import scipy.misc as misc
import urllib2, cStringIO, numpy

# Singular display name of each image; an "s" is appended when printing counts.
item = ["Circle", "Object", "Pepper"]
# Download locations of the source images, in the same order as `item`.
url = ["https://raw.githubusercontent.com/jzuniga123/SPS/master/DATA%20602/circles.png",
       "https://raw.githubusercontent.com/jzuniga123/SPS/master/DATA%20602/objects.png",
       "https://raw.githubusercontent.com/jzuniga123/SPS/master/DATA%20602/peppers.png"]
# Paths the thresholded images are saved to and later re-read from.
# NOTE(review): the leading ".\" looks like a Windows-style relative path -
# confirm behaviour on other platforms.
outfile = [".\DATA_602_HW08_1.png", ".\DATA_602_HW08_2.png", ".\DATA_602_HW08_3.png"]

# THRESHOLDING
# Download each image, blur it, and save a binary (above/below cutoff) version.
std_dev = [10, 2, 4] # lower number gives more detail
for i, sigma in enumerate(std_dev):
    response = urllib2.urlopen(url[i])
    infile = cStringIO.StringIO(response.read())
    raw = misc.imread(infile)
    img = ndimage.gaussian_filter(raw, sigma)
    # Each image uses its own cutoff: 90th percentile, mean, 75th percentile.
    if i == 1:
        cutoff = img.mean()
    elif i == 0:
        cutoff = numpy.percentile(img, 90)
    else:
        cutoff = numpy.percentile(img, 75)
    thres = img > cutoff
    misc.imsave(outfile[i], thres)

# COUNTING (5, 8, ~29)
for i in range(0, 3):
    img = misc.imread(outfile[i])
    labels, count = ndimage.label(img)
    print "Number of", item[i] + "s:", count

# CENTER OF MASS
for i in range(0, 3):
    img = misc.imread(outfile[i])
    labels, count = ndimage.label(img)
    index = list(range(1, count))
    center = ndimage.measurements.center_of_mass(img, labels, index)
    for j in range(0, count):
        print item[i], j+1, "Center of Mass:", center[j-1][0:2]

## Number of Circles: 5
## Number of Objects: 8
## Number of Peppers: 30
## Circle 1 Center of Mass: (299.50566414926698, 361.97823189693469)
## Circle 2 Center of Mass: (124.14465408805032, 369.34835779175404)
## Circle 3 Center of Mass: (150.09339805825243, 522.39281553398064)
## Circle 4 Center of Mass: (203.49263674614306, 212.48316970546983)
## Circle 5 Center of Mass: (299.50566414926698, 361.97823189693469)
## Object 1 Center of Mass: (399.33568904593642, 93.738515901060069)
## Object 2 Center of Mass: (73.500359350294673, 509.29711082363087)
## Object 3 Center of Mass: (159.33661650706793, 181.12398540811674)
## Object 4 Center of Mass: (148.20474842082336, 409.8503593988238)
## Object 5 Center of Mass: (310.37425552353506, 159.43189241114314)
## Object 6 Center of Mass: (371.48889659099348, 419.52726957641175)
## Object 7 Center of Mass: (295.4646017699115, 267.92035398230087)
## Object 8 Center of Mass: (399.33568904593642, 93.738515901060069)
## Pepper 1 Center of Mass: (502.67257177764202, 500.49175320708611)
## Pepper 2 Center of Mass: (8.2934895833333329, 152.90755208333334)
## Pepper 3 Center of Mass: (11.912353757683919, 288.72853460241919)
## Pepper 4 Center of Mass: (44.072585187271194, 386.35831455927905)
## Pepper 5 Center of Mass: (12.581196581196581, 498.95840455840454)
## Pepper 6 Center of Mass: (11.524752475247524, 104.01485148514851)
## Pepper 7 Center of Mass: (38.625260235947259, 23.508501040943788)
## Pepper 8 Center of Mass: (66.845660377358485, 506.0147169811321)
## Pepper 9 Center of Mass: (54.21965317919075, 81.100367840252233)
## Pepper 10 Center of Mass: (62.903954802259889, 180.18644067796609)
## Pepper 11 Center of Mass: (80.609544468546645, 26.216919739696312)
## Pepper 12 Center of Mass: (81.043478260869563, 55.444444444444443)
## Pepper 13 Center of Mass: (354.87987041983649, 125.09610528794239)
## Pepper 14 Center of Mass: (168.48866476258479, 461.6241520885398)
## Pepper 15 Center of Mass: (133.61088871096877, 17.914931945556447)
## Pepper 16 Center of Mass: (128.85532919988191, 320.16268083850014)
## Pepper 17 Center of Mass: (160.78849315068493, 232.79342465753425)
## Pepper 18 Center of Mass: (157.27696793002914, 91.935860058309032)
## Pepper 19 Center of Mass: (171.48648648648648, 358.62642642642641)
## Pepper 20 Center of Mass: (167.26666666666668, 510.33333333333331)
## Pepper 21 Center of Mass: (242.95094516481799, 299.14888298703329)
## Pepper 22 Center of Mass: (290.29381709998705, 385.66608459448969)
## Pepper 23 Center of Mass: (238.02985074626866, 272.19402985074629)
## Pepper 24 Center of Mass: (239.66054054054055, 360.68324324324323)
## Pepper 25 Center of Mass: (262.66748971193414, 233.09465020576133)
## Pepper 26 Center of Mass: (311.30473186119872, 503.37602523659308)
## Pepper 27 Center of Mass: (313.3392058165548, 293.07410514541385)
## Pepper 28 Center of Mass: (464.04229166666664, 450.15354166666668)
## Pepper 29 Center of Mass: (463.66666666666669, 378.3388888888889)
## Pepper 30 Center of Mass: (502.67257177764202, 500.49175320708611)

Circles (\(\sigma=10\))	Objects (\(\sigma=2\))	Peppers (\(\sigma=4\))

Assignment 9

The EPA-HTTP trace contains a day’s worth of all HTTP requests to the EPA WWW server located at Research Triangle Park, NC. Use the pandas module to answer the following questions about the EPA-HTTP data set. Print the result of each part to the console. Use pandas as much as you can; this includes the data structure and the analysis.

Which hostname or IP address made the most requests?
Which hostname or IP address received the most total bytes from the server? How many bytes did it receive?
During what hour was the server the busiest in terms of requests? (You can do this by grouping each hour period e.g. 13:00 - 14:00. Then count the number of requests in each hour)
Which .gif image was downloaded the most during the day?
What HTTP reply codes were sent other than 200?

Use any other tools or techniques you need to create an efficient program. These include scipy, numpy, regex, Tkinter, etc.

import pandas, datetime, re

url = "https://raw.githubusercontent.com/jzuniga123/SPS/master/DATA%20602/epa-http.txt"

# IMPORT RAW DATA AND REMOVE SOLITARY DOUBLE QUOTES
# Each log line is loaded as a single string column named 'raw'.
dirty = pandas.read_table(url, header = None, names = ['raw'])
# Drop a stray double quote that sits between "=" and the whitespace before "H".
# The whitespace is captured and re-inserted with \1, because "\s" is not a
# valid escape inside a replacement string.
clean = dirty.replace(r'=\"(\s)H', r'=\1H', regex=True)
data = pandas.DataFrame(clean, columns = ['raw'])

# BUILD DATA FRAME COLUMN-BY-COLUMN FROM RAW DATA
# host: first whitespace-delimited token of the line.
data['host'] = data['raw'].str.extract(r'(^\S+)', expand=True)
# date: the bracketed [DD:HH:MM:SS] stamp; month and year are appended as
# "081995" so the value can be parsed into a full datetime.
data['date'] = data['raw'].str.extract(r'(\[\S+\])', expand=True)
data['date'] = pandas.to_datetime(data['date']+"081995", format='[%d:%H:%M:%S]%m%Y')
# request: everything from the first to the last double quote, quotes stripped.
data['request'] = data['raw'].str.extract(r'(\".+\")', expand=True)
data['request'] = data['request'].replace('^"|"$', '', regex=True)
# reply: the word after the closing quote that is followed by "-" or a digit.
data['reply'] = data['raw'].str.extract(r'((?<=\"\s)\w+(?=\s[-|\d]))', expand=True)
data['reply'] = pandas.to_numeric(data['reply'])
# bytes: last token of the line; "-" is replaced by 0 before conversion.
data['bytes'] = data['raw'].str.extract(r'(\S+$)', expand=True)
data['bytes'] = pandas.to_numeric(data['bytes'].replace('-', '0', regex=True))
# Pass axis by keyword; the positional form is not accepted by newer pandas.
data = data.drop('raw', axis=1)

# QUESTIONS
requests = data.groupby('host')['host'].agg({'requests': 'count'})
print requests.sort_values('requests').tail(1), "\n" + "="*40
requests = data.groupby('host')['bytes'].agg({'bytes_sum': 'sum'})
print requests.sort_values('bytes_sum').tail(1), "\n" + "="*40
data['hour'] = pandas.DatetimeIndex(data['date']).hour
requests = data.groupby('hour')['hour'].agg({'requests': 'count'})
print requests.sort_values('requests').tail(1), "\n" + "="*40
gifs = []
for i in range(0, len(data)):
    gif = re.findall("[\w|_]+\.gif", data['request'][i])
    if len(gif) == 0:
        gifs.append(None)
    else:
        gifs.append(gif[0])
data['gif'] = gifs
requests = data.groupby('gif')['gif'].agg({'requests': 'count'})
print requests.sort_values('requests').tail(1), "\n" + "="*40
replies = data[data['reply'] != 200].groupby('reply')['reply'].agg({'requests': 'count'})
print replies.sort_values('requests'), "\n" + "="*40

##                        requests
## host                           
## sandy.rtptok1.epa.gov       294 
## ========================================
##                          bytes_sum
## host                              
## piankhi.cs.hamptonu.edu    7267751 
## ========================================
##       requests
## hour          
## 14        4716 
## ========================================
##                        requests
## gif                            
## circle_logo_small.gif      3241 
## ========================================
##        requests
## reply          
## 400           6
## 500          69
## 403         272
## 501         272
## 404         611
## 302        4506
## 304        5300 
## ========================================