Kaggle competitions provide a fun and useful way of exploring different data science problems and techniques. In this blog post, I focus on one particularly interesting competition, ECML/PKDD 15: Taxi Trajectory Prediction, where the goal is to predict the destination of taxi trajectories in the city of Porto, Portugal, with maximum accuracy. The winners of that competition published an interesting summary of their solution on Kaggle's blog. They also published a detailed paper as well as their full implementation using Blocks, a framework built on top of Theano. I found their solution pretty interesting: a simple neural network combined with a clever clustering algorithm. So, just for fun, I decided to try to re-implement that solution using different frameworks, namely Keras and Tensorflow. Below I present my own implementation as well as several visualizations of the competition's dataset. As you follow along, you may also refer to my full implementation (including all the functions in the code package used in the code samples below), which I've published on Github: https://github.com/jphalip/ECML-PKDD-2015
Loading the data¶
First, let's import all the libraries we need:
import pickle
import csv
import calendar
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator, FormatStrFormatter
from scipy.interpolate import spline
from IPython.core.display import display_html
from keras.models import load_model
from code.utils import np_haversine, density_map, get_clusters, plot_embeddings
from code.data import load_data
from code.training import start_new_session, process_features, create_model
# Display plots inline
%matplotlib inline
# Fix random seed for reproducibility
np.random.seed(42)
... then load the whole dataset:
data = load_data()
For the sake of brevity, I won't go into all the details of how I've cleaned up and pre-processed the data in this blog post. For those details, please refer to the implementation of the load_data function available on Github.
The original training dataset contains nearly 1.7 million records of taxi trips. The competition's organizers provided a really tiny test dataset with only 320 records, which is too small to properly train our model. So, following the proportions described in the winners' paper, I've split the dataset into roughly 98% training, 1% validation and 1% test datasets:
print(data.train.shape)
print(data.validation.shape)
print(data.test.shape)
(1611521, 17)
(16444, 17)
(16445, 17)
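For reference, such a split can be obtained by shuffling the dataframe and then slicing it by the desired proportions. The snippet below is only a sketch of that idea (the function name and arguments are illustrative; the actual splitting logic lives in load_data on Github):
def split_dataset(df, validation_frac=0.01, test_frac=0.01, seed=42):
    """Shuffle a dataframe and slice it into train/validation/test parts."""
    shuffled = df.sample(frac=1, random_state=seed)
    n_validation = int(len(shuffled) * validation_frac)
    n_test = int(len(shuffled) * test_frac)
    validation = shuffled.iloc[:n_validation]
    test = shuffled.iloc[n_validation:n_validation + n_test]
    train = shuffled.iloc[n_validation + n_test:]
    return train, validation, test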
Exploring the data¶
Now let's take a deeper look into our dataset. First, here are the available columns with all the pre-processed features:
data.train.head(3)
| | TRIP_ID | CALL_TYPE | ORIGIN_CALL | ORIGIN_STAND | TAXI_ID | TIMESTAMP | POLYLINE | START_LAT | START_LONG | QUARTER_HOUR | DAY_OF_WEEK | WEEK_OF_YEAR | DURATION | ORIGIN_CALL_ENCODED | TAXI_ID_ENCODED | ORIGIN_STAND_ENCODED | POLYLINE_FULL |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 1337703 | 1397770496620000648 | B | NaN | 15.0 | 20000648 | 2014-04-17 21:34:56 | [(41.148531, -8.585649), (41.148594, -8.585631... | 41.148531 | -8.585649 | 86 | 3 | 15 | 780 | 0 | 401 | 15 | [(41.148531, -8.585649), (41.148594, -8.585631... |
| 1619778 | 1402607008620000157 | B | NaN | 40.0 | 20000157 | 2014-06-12 21:03:28 | [(41.153823, -8.67411), (41.154237, -8.673912)... | 41.153823 | -8.674110 | 84 | 3 | 23 | 675 | 0 | 107 | 40 | [(41.153823, -8.67411), (41.154237, -8.673912)... |
| 1702294 | 1404009216620000557 | B | NaN | 27.0 | 20000557 | 2014-06-29 02:33:36 | [(41.147703, -8.608734), (41.147757, -8.608482... | 41.147703 | -8.608734 | 10 | 6 | 25 | 645 | 0 | 349 | 27 | [(41.147703, -8.608734), (41.147757, -8.608482... |
One key feature is the ORIGIN_STAND column, i.e. the unique identifier of the official stand where the customer caught the taxi (when applicable). Roughly 48% of taxi rides are in fact started from an official taxi stand, which is pretty significant:
# Percentage of taxi rides started at taxi stands
100 * pd.notnull(data.train['ORIGIN_STAND']).sum() / float(data.train.shape[0])
48.00185663109572
Here is how the ORIGIN_STAND
feature is distributed across the entire dataset:
plt.figure(figsize=(7.5,4))
plt.xticks(rotation=90, fontsize=7)
sns.countplot(data.train['ORIGIN_STAND'].dropna().astype(int))
plt.show()
Interestingly, though quite logically, the most popular taxi stands are IDs 15 and 57, i.e. Porto's two main train stations, Campanhã and São Bento, as you can see by clicking the links below:
for stand_id in [15, 57]:
    lat, long = data.train[data.train['ORIGIN_STAND'] == stand_id][['START_LAT', 'START_LONG']].mean()
    display_html(
        '<a href="https://www.google.com/maps/?q={lat},{long}" target="_blank">Stand #{stand_id}</a>'.format(
            lat=lat, long=long, stand_id=stand_id), raw=True)
Moreover, it's probably fair to assume that people use taxi services differently depending on the period of the year, the day of the week, or the time of day. Therefore I've extracted several discrete features from the provided TIMESTAMP column (i.e. the time at which each taxi ride started): week of the year (from 0 to 51), day of the week (from 0 to 6), and quarter hour of the day (from 0 to 95). Pandas makes this really easy with its DatetimeIndex:
datetime_index = pd.DatetimeIndex(dataframe['TIMESTAMP'])
dataframe['WEEK_OF_YEAR'] = datetime_index.weekofyear - 1  # 0 to 51
dataframe['DAY_OF_WEEK'] = datetime_index.dayofweek  # 0 (Monday) to 6 (Sunday)
dataframe['QUARTER_HOUR'] = datetime_index.hour * 4 + datetime_index.minute // 15  # 0 to 95
Now let's see how all taxi trips are distributed across each week of the year:
plt.figure(figsize=(7.5,4))
sns.countplot(data.train['WEEK_OF_YEAR'])
plt.gca().xaxis.set_major_locator(MultipleLocator(10))
plt.gca().xaxis.set_major_formatter(FormatStrFormatter('%d'))
plt.xlabel('Week of the year')
plt.show()
The graph above has a somewhat regular wavy pattern, seeming to indicate that customers tend to use taxis more at certain periods of each month. There's also a dip around mid-August, presumably due to people leaving the city for their summer vacation.
Now looking at the distribution of taxi trips across each day of the week:
plt.figure(figsize=(7.5,4))
sns.countplot(data.train['DAY_OF_WEEK'])
plt.gca().set_xticklabels(calendar.day_name)
plt.xticks(fontsize=8)
plt.xlabel('Day of the week')
plt.show()
... and across each quarter hour of the day:
plt.figure(figsize=(7.5,4))
sns.countplot(data.train['QUARTER_HOUR'], color='royalblue')
plt.gca().xaxis.set_major_locator(MultipleLocator(10))
plt.gca().xaxis.set_major_formatter(FormatStrFormatter('%d'))
plt.xticks(fontsize=9)
plt.xlabel('Quarter hour of the day')
plt.show()
From the two graphs above, we see peaks of taxi usage on Fridays and Saturdays (presumably by people going out for entertainment) and around 10am and early evening (presumably by people going to and from work). Nothing too surprising here.
Now we can take a look at the durations of trips, the majority of which are around 8 minutes:
plt.figure(figsize=(7.5,4))
bins = np.arange(60, data.train.DURATION.max(), 60)
binned = pd.cut(data.train.DURATION, bins, labels=bins[:-1]/60, include_lowest=True)
sns.countplot(binned, color='royalblue')
plt.gca().xaxis.set_major_locator(MultipleLocator(5))
plt.gca().xaxis.set_major_formatter(FormatStrFormatter('%d'))
plt.xlim(-1, 40)
plt.xticks(fontsize=9)
plt.xlabel('Duration (in minutes)')
plt.show()
Finally, it's interesting to get a visual representation of the dataset's spatial distribution (or density). This is quite simple and efficient to achieve in Python with a Matplotlib histogram. I've recently described in detail how this works in an earlier blog post, so here I'll just show the result:
all_coords = np.concatenate(data.train['POLYLINE_FULL'].as_matrix())
density_map(all_coords[:,0], all_coords[:,1])
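For context, density_map essentially bins the GPS points into a fine two-dimensional histogram and renders the log-scaled bin counts as an image. Here is a minimal sketch of that idea (the number of bins and the color map are arbitrary choices here, not necessarily those used for the rendering above):
def density_map(latitudes, longitudes, n_bins=1000):
    """Render the density of GPS points as a 2D histogram image."""
    histogram, _, _ = np.histogram2d(latitudes, longitudes, bins=n_bins)
    # Log-scale the counts so that less busy areas remain visible
    density = np.log(histogram + 1)
    plt.figure(figsize=(8, 8))
    plt.imshow(density[::-1, :], cmap='hot')  # flip vertically so that north appears at the top
    plt.axis('off')
    plt.show()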
This rendering process is pretty interesting as it clearly highlights Porto's busiest roads, the airport in the north-west corner, as well as the coastline and the river estuary contours.
Building the model¶
As I mentioned in the introduction, the model that won the competition is in fact pretty simple. It is a neural network with a single hidden layer of 500 neurons and a Rectified Linear Unit (ReLU) activation function. The input layer is composed of embedding vectors learned for each key feature (quarter hour of the day, day of the week, week of the year, the client IDs, the taxi IDs and the stand IDs), as well as the first 5 and last 5 recorded GPS coordinates of each taxi trip.
Where it gets particularly interesting is around the output layers. For this, the winning team's approach was threefold:
- Before training the model, do a bit of pre-processing by estimating the most popular destination points (a few thousand of them) using a mean-shift clustering algorithm.
- Use the softmax activation function in the second-to-last output layer to determine the probabilities of the destination belonging to each of the calculated clusters.
- In the last output layer, multiply those probabilities by the clusters' coordinates to obtain the predicted destination as a weighted average of the cluster centers.
The model uses a simple stochastic gradient descent optimizer. The loss function, as prescribed by the competition's rules, uses the Haversine formula to calculate the distance between the predicted and actual destination points.
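For reference, the Haversine formula computes the great-circle distance between two points on a sphere from their latitudes and longitudes. The np_haversine helper used for the evaluation later in this post is a NumPy implementation of it; here is a sketch of what such a function can look like, assuming arrays of (latitude, longitude) pairs expressed in degrees and an Earth radius of 6,371 km:
def np_haversine(coords_1, coords_2):
    """Haversine distance (in km) between two arrays of (lat, long) pairs, in degrees."""
    earth_radius_km = 6371.0
    lat_1, long_1 = np.radians(coords_1[:, 0]), np.radians(coords_1[:, 1])
    lat_2, long_2 = np.radians(coords_2[:, 0]), np.radians(coords_2[:, 1])
    a = (np.sin((lat_2 - lat_1) / 2) ** 2
         + np.cos(lat_1) * np.cos(lat_2) * np.sin((long_2 - long_1) / 2) ** 2)
    return 2 * earth_radius_km * np.arcsin(np.sqrt(a))
The tf_haversine loss used to compile the model below expresses the same formula with TensorFlow operations, so that it remains differentiable during training.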
Here is how I've implemented this model using Keras:
def create_model(metadata, clusters):
    """
    Creates all the layers for our neural network model.
    """
    # Arbitrary dimension for all embeddings
    embedding_dim = 10
    # Quarter hour of the day embedding
    embed_quarter_hour = Sequential()
    embed_quarter_hour.add(Embedding(metadata['n_quarter_hours'], embedding_dim, input_length=1))
    embed_quarter_hour.add(Reshape((embedding_dim,)))
    # Day of the week embedding
    embed_day_of_week = Sequential()
    embed_day_of_week.add(Embedding(metadata['n_days_per_week'], embedding_dim, input_length=1))
    embed_day_of_week.add(Reshape((embedding_dim,)))
    # Week of the year embedding
    embed_week_of_year = Sequential()
    embed_week_of_year.add(Embedding(metadata['n_weeks_per_year'], embedding_dim, input_length=1))
    embed_week_of_year.add(Reshape((embedding_dim,)))
    # Client ID embedding
    embed_client_ids = Sequential()
    embed_client_ids.add(Embedding(metadata['n_client_ids'], embedding_dim, input_length=1))
    embed_client_ids.add(Reshape((embedding_dim,)))
    # Taxi ID embedding
    embed_taxi_ids = Sequential()
    embed_taxi_ids.add(Embedding(metadata['n_taxi_ids'], embedding_dim, input_length=1))
    embed_taxi_ids.add(Reshape((embedding_dim,)))
    # Taxi stand ID embedding
    embed_stand_ids = Sequential()
    embed_stand_ids.add(Embedding(metadata['n_stand_ids'], embedding_dim, input_length=1))
    embed_stand_ids.add(Reshape((embedding_dim,)))
    # GPS coordinates (5 first lat/long and 5 latest lat/long, therefore 20 values)
    coords = Sequential()
    coords.add(Dense(1, input_dim=20, init='normal'))
    # Merge all the inputs into a single input layer
    model = Sequential()
    model.add(Merge([
        embed_quarter_hour,
        embed_day_of_week,
        embed_week_of_year,
        embed_client_ids,
        embed_taxi_ids,
        embed_stand_ids,
        coords
    ], mode='concat'))
    # Simple hidden layer
    model.add(Dense(500))
    model.add(Activation('relu'))
    # Determine cluster probabilities using softmax
    model.add(Dense(len(clusters)))
    model.add(Activation('softmax'))
    # Final activation layer: calculate the destination as the weighted mean of cluster coordinates
    cast_clusters = K.cast_to_floatx(clusters)
    def destination(probabilities):
        return tf.matmul(probabilities, cast_clusters)
    model.add(Activation(destination))
    # Compile the model
    optimizer = SGD(lr=0.01, momentum=0.9, clipvalue=1.)  # Use `clipvalue` to prevent exploding gradients
    model.compile(loss=tf_haversine, optimizer=optimizer)
    return model
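A quick note on the code above: it uses the Keras 1.x Sequential/Merge API and relies on a few module-level imports, roughly the following (I'm assuming here that tf_haversine lives alongside np_haversine in code.utils; see the repository for the exact layout):
import tensorflow as tf
from keras import backend as K
from keras.models import Sequential
from keras.layers import Activation, Dense, Embedding, Merge, Reshape
from keras.optimizers import SGD
from code.utils import tf_haversine  # TensorFlow version of the Haversine loss (assumed location)
Also note that, because of the Merge layer, the model expects its input as a list of arrays (one per embedded feature, plus one array holding the 20 GPS values), which is essentially what the process_features helper prepares from a dataframe.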
To estimate all the destination clusters efficiently, I've decided to follow two steps (which are combined into a single helper in the sketch after this list):
- First, coarsely simplify the spatial dataset by rounding the coordinates to the 4th decimal, i.e. roughly 11 meters (see: https://en.wikipedia.org/wiki/Decimal_degrees):
clusters = pd.DataFrame({
    'approx_latitudes': destinations[:,0].round(4),
    'approx_longitudes': destinations[:,1].round(4)
})
- Second, use the mean-shift algorithm available in the scikit-learn library to further reduce the number of clusters (note: the quantile parameter was tuned to obtain a sufficiently large yet manageable number of clusters):
bandwidth = estimate_bandwidth(clusters, quantile=0.0002)
ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
ms.fit(clusters)
clusters = ms.cluster_centers_
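Put together, those two steps can be wrapped into a single helper. Here is a sketch of that idea (an illustration only, not necessarily the exact get_clusters implementation, which is available on Github):
from sklearn.cluster import MeanShift, estimate_bandwidth

def get_clusters(destinations):
    """Estimate the popular destination clusters from an array of (lat, long) points."""
    # Step 1: round the coordinates to the 4th decimal (roughly 11 meters)
    approx_destinations = pd.DataFrame({
        'approx_latitudes': destinations[:, 0].round(4),
        'approx_longitudes': destinations[:, 1].round(4)
    })
    # Step 2: run mean-shift over the rounded points to extract cluster centers
    bandwidth = estimate_bandwidth(approx_destinations, quantile=0.0002)
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
    ms.fit(approx_destinations)
    return ms.cluster_centers_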
Let's now see how many clusters were estimated with this approach:
# Estimate clusters from all destination points
clusters = get_clusters(data.train_labels)
print("Number of estimated clusters: %d" % len(clusters))
Number of estimated clusters: 5451
We can now visualize those clusters on a map:
plt.figure(figsize=(6,6))
plt.scatter(clusters[:,1], clusters[:,0], c='#cccccc', s=2)
plt.axis('off')
plt.gca().xaxis.set_visible(False)
plt.gca().yaxis.set_visible(False)
plt.gca().autoscale_view('tight')
Here is another visualization of those same clusters (this time in green) with all the destinations superimposed on them (in black):
plt.figure(figsize=(6,6))
plt.scatter(clusters[:,1], clusters[:,0], c='#99cc99', edgecolor='None', alpha=0.7, s=40)
plt.scatter(data.train_labels[:,1], data.train_labels[:,0], c='k', alpha=0.2, s=1)
plt.grid('off')
plt.axis('off')
plt.gca().xaxis.set_visible(False)
plt.gca().yaxis.set_visible(False)
plt.gca().autoscale_view('tight')
Training the model¶
For this project I've trained the model over 200 epochs. Overall, the process took about an hour and a half to run on a GeForce GTX 1070 GPU. The train and validation loss values were recorded at each epoch and saved in a pickle file for later analysis.
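The history dictionary comes straight from Keras: model.fit returns a History object whose .history attribute maps each metric to its list of per-epoch values. The training step therefore looked roughly like the sketch below, where the batch size, checkpoint file naming and exact arguments are illustrative assumptions on my part (the actual training code is on Github):
from keras.callbacks import ModelCheckpoint

# Save the weights to a new file at the end of every epoch, so that the
# weights from any given epoch can be reloaded later on
checkpoint = ModelCheckpoint('model-weights-{epoch:02d}.hdf5', save_weights_only=True)
history = model.fit(
    process_features(data.train), data.train_labels,
    validation_data=(process_features(data.validation), data.validation_labels),
    nb_epoch=200,    # 'epochs' in recent Keras versions
    batch_size=200,  # arbitrary value, for illustration
    callbacks=[checkpoint])
# Persist the per-epoch loss values for later analysis
with open('history.pickle', 'wb') as handle:
    pickle.dump(history.history, handle)
With that file saved, the recorded loss values can then be loaded back and visualized as follows: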
with open('history.pickle', 'rb') as handle:
    history = pickle.load(handle)
# Interpolate a smooth curve from the raw validation loss
n_epochs = len(history['val_loss'])
x_smooth = np.linspace(0, n_epochs-1, num=10)
y_smooth = spline(range(n_epochs), history['val_loss'], x_smooth)
plt.figure(figsize=(7.5,4))
plt.plot(history['loss'])
plt.plot(history['val_loss'])
plt.plot(x_smooth, y_smooth)
plt.title('Evolution of loss values during training')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.xticks(fontsize=9)
plt.gca().xaxis.set_major_locator(MultipleLocator(10))
plt.legend(['train', 'validation', 'smoothened validation'], loc='upper right')
plt.show()
The curves above show the evolution of the train and validation losses over 200 epochs. A smoothed version of the validation loss is also pictured to better illustrate its trend over time.
While the train loss predictably drops consistently and steeply, the learning process seems to reach convergence around epoch #70, at which point the validation loss stabilizes before it starts to increase. Under those circumstances, I've selected the weights learned at epoch #70 for my final model, which can now be loaded again with Keras:
start_new_session()
model = create_model(data.metadata, clusters)
model.load_weights('model-weights.hdf5')
We can now see the exact mean loss for our custom validation dataset:
validation_predictions = model.predict(process_features(data.validation))
np_haversine(validation_predictions, data.validation_labels).mean()
1.9707780925774365
... and for our custom test dataset:
test_predictions = model.predict(process_features(data.test))
np_haversine(test_predictions, data.test_labels).mean()
2.0029187354900984
That is, on average, roughly a 2 km distance between the actual and predicted destination points.
Future improvements¶
While the presented model provides encouraging results, there's clearly still room for improvement. Some avenues that could be explored include:
- Try different topologies for the neural network.
- Tweak the number of clusters.
- Incorporate ensemble models.
- Augment the training dataset by creating multiple random partial variations of the GPS trajectories.
- Engineer and include more features, for example:
- weather data (e.g. recorded temperature and rainfall)
- official holidays in Portugal
- occurrence of large events such as concerts or soccer games
I'd like to thank the competition's winning team for sharing their solution. I encourage anyone interested in spatial datasets to work on this competition as it's fun, challenging and interesting. Again, feel free to take a look at the implementation that I've published on Github — I hope that it provides some useful starting points for your own implementation strategies.