Chapter 5 Regression analysis
# R setup chunk: echo the source of every chunk in the rendered output.
knitr::opts_chunk$set(echo = TRUE)
# reticulate runs the Python chunks below; use the conda environment named "tf".
library(reticulate)
use_condaenv("tf")
5.1 Importing python packages
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import pandas as pd
import numpy as np
import statistics
from scipy.stats import norm
from matplotlib.ticker import EngFormatter, StrMethodFormatter
The fundamental data type of NumPy is the array type called numpy.ndarray. The rest of this article uses the term array to refer to instances of the type numpy.ndarray.
# Load the California housing dataset, inspect it, and plot a histogram of
# every column.
from sklearn.datasets import fetch_california_housing

# as_frame=True makes the loader expose pandas objects (.data and .frame are
# used below).
california_housing = fetch_california_housing(as_frame=True)
print(california_housing.DESCR)
california_housing.data.head()

# Looks good - let's convert it into a pandas dataframe
california_housing_df = pd.DataFrame(california_housing.data)
print(california_housing_df)

# One histogram per column of the full frame; widen the gaps between subplots
# so titles and tick labels do not overlap.
california_housing.frame.hist(figsize=(12, 10), bins=30, edgecolor="black")
plt.subplots_adjust(hspace=0.7, wspace=0.4)
plt.show()
5.2 Create a model and fit it
The next step is to create the regression model as an instance of LinearRegression and fit it with .fit().
import sklearn
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
# Fit a simple linear regression of MedInc on HouseAge and report the fit.

# Choose our variables of interest.
# Double brackets keep each selection as a one-column DataFrame, so the
# fitted intercept_ and coef_ are printed as arrays below.
x = california_housing_df[['HouseAge']]
y = california_housing_df[['MedInc']]

# Make a model
model = LinearRegression().fit(x, y)

# Analyse the model fit
r_sq = model.score(x, y)
print('coefficient of determination:', r_sq)
## coefficient of determination: 0.014169090760525749
print('intercept:', model.intercept_)
## intercept: [4.38527909]
print('slope:', model.coef_)
## slope: [[-0.01796848]]
5.3 Polynomial regression
We can fit polynomials of different orders by defining the corresponding objective function and passing it to SciPy's curve_fit.
# Load in relevant packages
from numpy import arange
from pandas import read_csv
from scipy.optimize import curve_fit
from matplotlib import pyplot
# Define the true objective function for a linear estimation
def objective(x, a, b):
    """Return the straight line ``a * x + b`` evaluated at *x*.

    Passed to ``curve_fit`` as the model function: *x* is the independent
    variable, and *a* (slope) and *b* (intercept) are the parameters to fit.
    """
    return a * x + b
# Download the Longley dataset and fit the linear objective to two of its
# columns with curve_fit.

# load the dataset
url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/longley.csv'
dataframe = read_csv(url, header=None)
data = dataframe.values

# choose the input and output variables
# (column index 4 is the input, the last column is the output)
x, y = data[:, 4], data[:, -1]

# curve fit
# curve_fit returns (optimal parameters, covariance); the covariance is unused.
popt, _ = curve_fit(objective, x, y)

# summarize the parameter values
a, b = popt
print('y = %.5f * x + %.5f' % (a, b))
## y = 0.48488 * x + 8.38067
# Plot the raw data together with the fitted straight line and save the
# figure to linear_model_chart.png.

# plot input vs output
plt.scatter(x, y, c="blue")
## <matplotlib.collections.PathCollection object at 0x186f613c0>

# define a sequence of inputs between the smallest and largest known inputs
x_line = arange(min(x), max(x), 1)

# calculate the output for the range
y_line = objective(x_line, a, b)

# create a line plot for the mapping function
# (this is the linear fit, so the legend entry is labelled accordingly)
plt.plot(x_line, y_line,
         label='Linear',
         color='purple',
         alpha=1,
         linewidth=1.2,
         linestyle='dashed')
## [<matplotlib.lines.Line2D object at 0x186f62050>]
plt.title('Using a linear model to approximate data', fontsize=12)
## Text(0.5, 1.0, 'Using a linear model to approximate data')
plt.xlabel('', fontsize=10)
## Text(0.5, 0, '')
plt.ylabel('', fontsize=10)
## Text(0, 0.5, '')

# Keep only the bottom spine visible.
ax = plt.gca()
ax.spines['top'].set_visible(False)
ax.spines['bottom'].set_visible(True)
ax.spines['right'].set_visible(False)
ax.spines['left'].set_visible(False)

plt.tick_params(
    axis='x',          # changes apply to the x-axis
    which='both',      # both major and minor ticks are affected
    bottom=False,      # ticks along the bottom edge are off
    top=False,         # ticks along the top edge are off
    labelbottom=True)  # labels along the bottom edge are on
plt.tick_params(
    axis='y',          # changes apply to the y-axis
    which='both',      # both major and minor ticks are affected
    left=False,        # ticks along the left edge are off
    right=False,       # ticks along the right edge are off
    labelleft=True)    # labels along the left edge are on

# Turn the grid off everywhere, then re-enable it for the y-axis only.
plt.grid(False)
ax.yaxis.grid(True)

plt.legend(fancybox=False, framealpha=1, shadow=False, borderpad=1)
## <matplotlib.legend.Legend object at 0x187f5d870>
plt.savefig('linear_model_chart.png', dpi=300, bbox_inches='tight')
plt.show()
Now let’s try a second-degree polynomial model.
# Fit a second degree polynomial to the economic data
from numpy import arange
from pandas import read_csv
from scipy.optimize import curve_fit
from matplotlib import pyplot
# Define the true objective function
def objective(x, a, b, c):
    """Return the quadratic ``a * x + b * x**2 + c`` evaluated at *x*.

    Passed to ``curve_fit`` as the model function: *x* is the independent
    variable; *a* is the linear coefficient, *b* the quadratic coefficient
    and *c* the constant term.
    """
    return a * x + b * x**2 + c
# Download the Longley dataset and fit the second-degree objective to two of
# its columns with curve_fit.

# Load the dataset
url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/longley.csv'
dataframe = read_csv(url, header=None)
data = dataframe.values

# choose the input and output variables
# (column index 4 is the input, the last column is the output)
x, y = data[:, 4], data[:, -1]

# curve fit
# curve_fit returns (optimal parameters, covariance); the covariance is unused.
popt, _ = curve_fit(objective, x, y)

# summarize the parameter values
a, b, c = popt
print('y = %.5f * x + %.5f * x^2 + %.5f' % (a, b, c))
# Plot the raw data together with the fitted second-degree polynomial and
# save the figure to its own file.

# plot input vs output
plt.scatter(x, y, c="blue")

# define a sequence of inputs between the smallest and largest known inputs
x_line = arange(min(x), max(x), 1)

# calculate the output for the range
y_line = objective(x_line, a, b, c)

# create a line plot for the mapping function
plt.plot(x_line, y_line,
         label='Polynomial',
         color='purple',
         alpha=1,
         linewidth=1.2,
         linestyle='dashed')
plt.title('Using a polynomial model to approximate data', fontsize=12)
plt.xlabel('', fontsize=10)
plt.ylabel('', fontsize=10)

# Keep only the bottom spine visible.
ax = plt.gca()
ax.spines['top'].set_visible(False)
ax.spines['bottom'].set_visible(True)
ax.spines['right'].set_visible(False)
ax.spines['left'].set_visible(False)

plt.tick_params(
    axis='x',          # changes apply to the x-axis
    which='both',      # both major and minor ticks are affected
    bottom=False,      # ticks along the bottom edge are off
    top=False,         # ticks along the top edge are off
    labelbottom=True)  # labels along the bottom edge are on
plt.tick_params(
    axis='y',          # changes apply to the y-axis
    which='both',      # both major and minor ticks are affected
    left=False,        # ticks along the left edge are off
    right=False,       # ticks along the right edge are off
    labelleft=True)    # labels along the left edge are on

# Turn the grid off everywhere, then re-enable it for the y-axis only.
plt.grid(False)
ax.yaxis.grid(True)

plt.legend(fancybox=False, framealpha=1, shadow=False, borderpad=1)
# Save under a distinct name so the linear chart written to
# 'linear_model_chart.png' is not overwritten.
plt.savefig('polynomial_model_chart.png', dpi=300, bbox_inches='tight')
plt.show()