In [ ]:
import pandas as pd
import numpy as np
#Import data into dataframe
#year
#month
#day (number of day in year)
#week (day of week)
#temp_2 (temp 2 days ago)
#temp_1 (temp yesterday)
#average (average historic temp for the day)
#actual (actual measure)
#friend (friend's prediction)
#
path = 'Desktop/temps.csv'
df = pd.read_csv(path, delimiter=',', header='infer')
#
#Data validation:
#Do we have complete data? No - some missing days
#Let's validate all the data looks sensible
#we could also plot the data to find anomalies
df.describe()
'''Out[4]:
year month ... forecast_under friend
count 348.0 348.000000 ... 348.000000 348.000000
mean 2016.0 6.477011 ... 59.772989 60.034483
std 0.0 3.498380 ... 10.705256 15.626179
min 2016.0 1.000000 ... 44.000000 28.000000
25% 2016.0 3.000000 ... 50.000000 47.750000
50% 2016.0 6.000000 ... 58.000000 60.000000
75% 2016.0 10.000000 ... 69.000000 71.000000
max 2016.0 12.000000 ... 79.000000 95.000000'''
#
#All columns must be numeric for the model - we can't include 'Mon, Tue...', so we can use pd.get_dummies to change the days of the week to be numeric
#if we map week days 1-7, the model can place a higher weighting on sunday (because it has the highest number).
#so instead, we make the columns binary, this is called hot encoding
#
df = pd.get_dummies(df)
#
'''df
Out[7]:
year month day temp_2 ... week_Sun week_Thurs week_Tues week_Wed
0 2016 1 1 45 ... 0 0 0 0
1 2016 1 2 44 ... 0 0 0 0
2 2016 1 3 45 ... 1 0 0 0
3 2016 1 4 44 ... 0 0 0 0
4 2016 1 5 41 ... 0 0 1 0
5 2016 1 6 40 ... 0 0 0 1
6 2016 1 7 44 ... 0 1 0 0
7 2016 1 8 51 ... 0 0 0 0
8 2016 1 9 45 ... 0 0 0 0
9 2016 1 10 48 ... 1 0 0 0
10 2016 1 11 50 ... 0 0 0 0
11 2016 1 12 52 ... 0 0 1 0
12 2016 1 13 45 ... 0 0 0 1
13 2016 1 14 49 ... 0 1 0 0
14 2016 1 15 55 ... 0 0 0 0
15 2016 1 16 49 ... 0 0 0 0
16 2016 1 17 48 ... 1 0 0 0'''
#
#Now, we need to get the results into a format that the model accepts. We need to convert the pandas dataframe to numpy arrays and split the actuals (labels) from the features
labels = np.array(df['actual']) #these are the actuals
features = df.drop('actual', axis=1) #remove the actual results from the dataframe
feature_list = list(df.columns) #these are column names for future reference
features = np.array(features) #convert the features dataframe to numpy array
#
#now we need to split the data into training and testing sets
#In training, the model has the actual output - so it can find links between the input features and the output
#In testing, it's blind to the actual results, so we can validate the accuracy of the model
from sklearn.model_selection import train_test_split #This is the sklearn module for splitting data
#
#In this example, we're extracting training and testing labels from our data
#random state means we will get the same results every time we run the split
#We should do this randomly, as if we selected the last 90 days of data from our 12 month dataset, the model would perform poorly.
#test size defines the test dataset as being 25% of the entire set
#It needs a representative example from the whole dataset.
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = 0.25, random_state = 42)
#
#We can now print the shape of the extracted features. The training features and labels should match in length
#The test set is set to 25% of the entire dataset above
print(train_features.shape)
print(train_labels.shape)
print(test_features.shape)
print(test_labels.shape)
#As we can see, the lengths are correct. Feature and label lengths match
(261, 17)
(261,)
(87, 17)
(87,)
#
#We now need to establish a baseline. That is, how accurate would our model be, if we just made random predictions (e.g. average max temps for days)
# So, let's take the values from the average column in our feature list
baseline = test_features[:, feature_list.index('average')]
# Now, we can simply take the average values and subtract from them the actual results (the test labels)
#from this, we'll know the variance between just predicting the daily average and what actually happened
baseline_errors = abs(baseline - test_labels)
#print the rounded values of the errors, which totals 5.06 degrees. This will be our baseline, we need to improve on that with our model
print(round(np.mean(baseline_errors), 2))
#
#Now we have all our data prepared and our baseline set, we can start with the model
from sklearn.ensemble import RandomForestRegressor
#Let's setup a random forest with 1000 estimators
randomforest = RandomForestRegressor(n_estimators = 1000, random_state = 42)
#now lets fit our model with training data
randomforest.fit(train_features, train_labels);
#
#Now we need to predict the actual outcomes, using the random forest model, rather than the averages we used above
#To do that, we run the below line, which passes the test_features into our random forest algorithm
predictions = randomforest.predict(test_features)
#we can now, as above, subtract actual results from the predictions, to give us our variance
errors = abs(predictions - test_labels)
#The variance is 3.87k which is lower than the 5.06 we got from the averages.
#lower variance means the model is more accurate, so it's a success
print(round(np.mean(errors), 2))
#
#Now lets work out the percentage accuracy of the model
#we take the percent errors:
percent_error = 100 * (errors / test_labels)
#So we can see that the percentage of error per datapoint is calculated
'''percent_error
Out[23]:
array([ 5.84393939, 0.52295082, 0.20961538, 7.08939394, 5.05285714,
14.43658537, 7.23764706, 9.77857143, 4.42923077, 19.48043478,
4.33934426, 15.08705882, 12.59545455, 3.94923077, 3.75942029,
9.65645161, 4.7765625 , 1.77678571, 7.07169811, 3.65443038,
1.2031746 , 4.55964912, 0.75671642, 0.75 , 0.36271186,
5.3 , 2.72153846, 10.67692308, 5.87575758, 5.49473684,
15.3640625 , 5.43114754, 14.975 , 1.2 , 3.66493506,
8.23684211, 7.52413793, 9.41276596, 1.36617647, 16.76078431,
8.59090909, 2.45178571, 4.34520548, 16.87058824, 3.5559322 ,
14.85057471, 9.06034483, 0.47037037, 8.06206897, 1.12142857,
5.0877551 , 5.265 , 1.27692308, 2.6609375 , 5.9 ,
5.63230769, 4.10754717, 5.16666667, 3.11509434, 0.56 ,
2.27761194, 2.16530612, 14.80943396, 1.72173913, 9.20363636,
12.88823529, 5.43684211, 1.15362319, 1.5 , 14.77777778,
3.87567568, 0.38070175, 12.49565217, 0.936 , 2.81071429,
4.49104478, 8.93541667, 7.0575 , 1.44583333, 6.7877551 ,
6.09473684, 11.41558442, 9.12716049, 8.44477612, 4.17727273,
0.22105263, 2.31111111])'''
#
# we then take the average of those error percentages to determine the model accuracy. You can see the model is 93.93% accurate
accuracy = 100 - np.mean(percent_error)
'''accuracy
Out[25]: 93.93703038326089'''
#
# let's find out the weightings for each of the features in the model
importances = list(randomforest.feature_importances_)
#
#We can now export a list of the features, with their importance
#You can see that temp_1 is the most important feature - which is the temperature of yesterday
#In subsequent models, we can remove the features that have no importance to the model
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(feature_list, importances)]
'''[('year', 0.0),
('month', 0.01),
('day', 0.02),
('temp_2', 0.02),
('temp_1', 0.66),
('average', 0.15),
('actual', 0.05),
('forecast_noaa', 0.03),
('forecast_acc', 0.02),
('forecast_under', 0.02),
('friend', 0.0),
('week_Fri', 0.0),
('week_Mon', 0.0),
('week_Sat', 0.0),
('week_Sun', 0.0),
('week_Thurs', 0.0),
('week_Tues', 0.0)]'''
#
#We can now visually look at the importance of each feature
#Import matplotlib
import matplotlib.pyplot as plt
#Enable plotting in ipython
%pylab
#Add the xvalues
x_values = list(range(len(importances)))
#Plot the chart
plt.bar(x_values, importances, orientation = 'vertical')
#we need to add feature names to X
plt.xticks(x_values, feature_list, rotation='vertical')
#Now we give the y axis the features it needs
plt.ylabel('Importance'); plt.xlabel('Variable'); plt.title('Variable Importances');