1 Univariate Linear Regression

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
path = 'ex1data1.txt'
# read the training data: city population vs. profit
data = pd.read_csv(path, header=None, names=['Population', 'Profit'])
data.head()
##    Population   Profit
## 0      6.1101  17.5920
## 1      5.5277   9.1302
## 2      8.5186  13.6620
## 3      7.0032  11.8540
## 4      5.8598   6.8233
data.describe()
##        Population     Profit
## count   97.000000  97.000000
## mean     8.159800   5.839135
## std      3.869884   5.510262
## min      5.026900  -2.680700
## 25%      5.707700   1.986900
## 50%      6.589400   4.562300
## 75%      8.578100   7.046700
## max     22.203000  24.147000
data.plot(kind='scatter', x='Population', y='Profit', figsize=(12,8))
plt.show()

Cost function: \[J(\theta)=\frac{1}{2m}\sum\limits_{i=1}^{m}\left(h_\theta(x^{(i)})-y^{(i)}\right)^2\] where the hypothesis is the linear model: \[h_\theta(x)=\theta^T x=\theta_0 x_0+\theta_1 x_1+\theta_2 x_2+\dots+\theta_n x_n\]

def computeCost(X, y, theta):
    # sum of squared errors of the linear hypothesis, divided by 2m
    inner = np.power(((X * theta.T) - y), 2)
    return np.sum(inner) / (2 * len(X))
# prepend a column of ones so theta_0 acts as the intercept term
data.insert(0, 'Ones', 1)
data.head()
##    Ones  Population   Profit
## 0     1      6.1101  17.5920
## 1     1      5.5277   9.1302
## 2     1      8.5186  13.6620
## 3     1      7.0032  11.8540
## 4     1      5.8598   6.8233
# set X (training data) and y (target variable)
cols = data.shape[1]
X = data.iloc[:,0:cols-1]   # X: all rows, every column except the last
y = data.iloc[:,cols-1:cols]  # y: all rows, the last column only
X.head()
##    Ones  Population
## 0     1      6.1101
## 1     1      5.5277
## 2     1      8.5186
## 3     1      7.0032
## 4     1      5.8598
y.head()
##     Profit
## 0  17.5920
## 1   9.1302
## 2  13.6620
## 3  11.8540
## 4   6.8233
# convert the data from DataFrame to numpy matrix
X = np.matrix(X.values)
y = np.matrix(y.values)
theta = np.matrix(np.array([0,0]))
X.shape, y.shape, theta.shape
## ((97, 2), (97, 1), (1, 2))
computeCost(X, y, theta)
## 32.072733877455676
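As a quick sanity check (not part of the original exercise): with theta fixed at zero, the hypothesis predicts 0 for every example, so the cost reduces to the sum of squared targets over 2m, and the line below should reproduce the value above.

# with theta = 0, J(theta) simplifies to sum(y^2) / (2m)
np.sum(np.power(y, 2)) / (2 * len(X))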

2 Batch Gradient Descent

\[\theta_j := \theta_j - \alpha \frac{\partial}{\partial \theta_j} J(\theta)\]
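For the squared-error cost defined above, the partial derivative evaluates to the prediction errors weighted by the j-th feature, which is exactly what the implementation below computes for each j:

\[\theta_j := \theta_j - \frac{\alpha}{m}\sum\limits_{i=1}^{m}\left(h_\theta(x^{(i)})-y^{(i)}\right)x_j^{(i)}\]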

def gradientDescent(X, y, theta, alpha, iters):
    temp = np.matrix(np.zeros(theta.shape))
    parameters = int(theta.ravel().shape[1])
    cost = np.zeros(iters)
    
    for i in range(iters):
        # residuals for the current theta; computed once per iteration,
        # so the per-parameter updates below are simultaneous
        error = (X * theta.T) - y
        
        for j in range(parameters):
            term = np.multiply(error, X[:,j])
            temp[0,j] = theta[0,j] - ((alpha / len(X)) * np.sum(term))
            
        theta = temp
        cost[i] = computeCost(X, y, theta)
        
    return theta, cost
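As an aside, the inner loop over the parameters can be collapsed into a single matrix product. A minimal vectorised sketch (the name gradientDescentVec is introduced here for illustration; it should return the same result):

def gradientDescentVec(X, y, theta, alpha, iters):
    cost = np.zeros(iters)
    for i in range(iters):
        error = (X * theta.T) - y                         # (m, 1) residuals
        theta = theta - (alpha / len(X)) * (error.T * X)  # update all theta_j at once
        cost[i] = computeCost(X, y, theta)
    return theta, cost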
alpha = 0.01
iters = 1000
g, cost = gradientDescent(X, y, theta, alpha, iters)
g
## matrix([[-3.24140214,  1.1272942 ]])
computeCost(X, y, g)
## 4.515955503078914
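The fitted parameters can be used directly for point predictions. For example (assuming, as in the original Ng dataset, that population is measured in units of 10,000):

# predicted profit for a population feature value of 7.0 (i.e. 70,000 people)
np.matrix([1, 7.0]) * g.T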
x = np.linspace(data.Population.min(), data.Population.max(), 100)
f = g[0, 0] + (g[0, 1] * x)

fig, ax = plt.subplots(figsize=(12,8))
ax.plot(x, f, 'r', label='Prediction')
ax.scatter(data.Population, data.Profit, label='Training Data')
ax.legend(loc=2)
ax.set_xlabel('Population')
ax.set_ylabel('Profit')
ax.set_title('Predicted Profit vs. Population Size')
plt.show()

fig, ax = plt.subplots(figsize=(12,8))
ax.plot(np.arange(iters), cost, 'r')
ax.set_xlabel('Iterations')
ax.set_ylabel('Cost')
ax.set_title('Error vs. Training Epoch')
plt.show()
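The shape of this curve depends strongly on the learning rate. As a quick experiment (the alpha values below are arbitrary choices for illustration):

# compare convergence for a few learning rates
fig, ax = plt.subplots(figsize=(12,8))
for rate in [0.001, 0.005, 0.01]:
    _, c = gradientDescent(X, y, theta, rate, iters)
    ax.plot(np.arange(iters), c, label='alpha = {}'.format(rate))
ax.legend()
plt.show()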

3 Multivariate Linear Regression

path =  'ex1data2.txt'
data2 = pd.read_csv(path, header=None, names=['Size', 'Bedrooms', 'Price'])
data2.head()
##    Size  Bedrooms   Price
## 0  2104         3  399900
## 1  1600         3  329900
## 2  2400         3  369000
## 3  1416         2  232000
## 4  3000         4  539900
data2.describe()
##               Size   Bedrooms          Price
## count    47.000000  47.000000      47.000000
## mean   2000.680851   3.170213  340412.659574
## std     794.702354   0.760982  125039.899586
## min     852.000000   1.000000  169900.000000
## 25%    1432.000000   3.000000  249900.000000
## 50%    1888.000000   3.000000  299900.000000
## 75%    2269.000000   4.000000  384450.000000
## max    4478.000000   5.000000  699900.000000
# feature scaling: standardise each column to zero mean and unit variance,
# keeping the statistics so predictions can be mapped back to original units
mu, sigma = data2.mean(), data2.std()
data2 = (data2 - mu) / sigma
data2.head()
##        Size  Bedrooms     Price
## 0  0.130010 -0.223675  0.475747
## 1 -0.504190 -0.223675 -0.084074
## 2  0.502476 -0.223675  0.228626
## 3 -0.735723 -1.537767 -0.867025
## 4  1.257476  1.090417  1.595389
data2.describe()
##                Size      Bedrooms         Price
## count  4.700000e+01  4.700000e+01  4.700000e+01
## mean   3.779483e-17  2.746030e-16 -9.684924e-17
## std    1.000000e+00  1.000000e+00  1.000000e+00
## min   -1.445423e+00 -2.851859e+00 -1.363666e+00
## 25%   -7.155897e-01 -2.236752e-01 -7.238702e-01
## 50%   -1.417900e-01 -2.236752e-01 -3.239979e-01
## 75%    3.376348e-01  1.090417e+00  3.521863e-01
## max    3.117292e+00  2.404508e+00  2.874981e+00
# add ones column
data2.insert(0, 'Ones', 1)

# set X (training data) and y (target variable)
cols = data2.shape[1]
X2 = data2.iloc[:,0:cols-1]
y2 = data2.iloc[:,cols-1:cols]

# convert to matrices and initialize theta
X2 = np.matrix(X2.values)
y2 = np.matrix(y2.values)
theta2 = np.matrix(np.array([0,0,0]))

# perform linear regression on the data set
g2, cost2 = gradientDescent(X2, y2, theta2, alpha, iters)

# get the cost (error) of the model
computeCost(X2, y2, g2)
## 0.1307033696077189
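Because the model was trained on standardised features, the saved mu and sigma let us make predictions in the original units. An illustrative example (the 1650 sq ft / 3 bedroom values are arbitrary):

# illustrative prediction on the original scale: 1650 sq ft, 3 bedrooms
x_size = (1650 - mu['Size']) / sigma['Size']
x_bed = (3 - mu['Bedrooms']) / sigma['Bedrooms']
price_norm = g2[0, 0] + g2[0, 1] * x_size + g2[0, 2] * x_bed
price = price_norm * sigma['Price'] + mu['Price']  # back to dollars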
fig, ax = plt.subplots(figsize=(12,8))
ax.plot(np.arange(iters), cost2, 'r')
ax.set_xlabel('Iterations')
ax.set_ylabel('Cost')
ax.set_title('Error vs. Training Epoch')
plt.show()

For comparison, scikit-learn's LinearRegression can fit the univariate data directly. Recent scikit-learn versions deprecate np.matrix inputs, so we convert back to plain arrays first:

from sklearn import linear_model
model = linear_model.LinearRegression()
model.fit(np.asarray(X), np.asarray(y))
## LinearRegression()
x = np.array(X[:, 1].A1)
f = model.predict(np.asarray(X)).flatten()
fig, ax = plt.subplots(figsize=(12,8))
ax.plot(x, f, 'r', label='Prediction')
ax.scatter(data.Population, data.Profit, label='Training Data')
ax.legend(loc=2)
ax.set_xlabel('Population')
ax.set_ylabel('Profit')
ax.set_title('Predicted Profit vs. Population Size')
plt.show()

4 Normal Equation
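The normal equation minimises \(J(\theta)\) in closed form, with no learning rate and no iteration:

\[\theta = (X^T X)^{-1} X^T y\]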

# normal equation: closed-form least-squares solution
def normalEqn(X, y):
    theta = np.linalg.inv(X.T@X)@X.T@y  # X.T@X is equivalent to X.T.dot(X)
    return theta
final_theta2 = normalEqn(X, y)  # note: differs a bit from the gradient-descent theta
final_theta2
## matrix([[-3.89578088],
##         [ 1.19303364]])
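These are the exact least-squares parameters, which explains the gap noted above: 1000 iterations of gradient descent at alpha = 0.01 had not fully converged. Running it longer should close most of the gap (the iteration count here is an arbitrary choice):

# gradient descent approaches the normal-equation solution as iterations grow
g_long, _ = gradientDescent(X, y, theta, alpha, 10000)
g_long  # should be close to matrix([[-3.8958, 1.1930]])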