A predictive algorithm for cryptocurrency market making

  1. Introduction
  2. Input parameters, data, and dependencies
  3. Prediction of fair value
  4. Spread estimation
  5. Conclusions
  6. References

Introduction ¶

This document details a procedure to investigate whether the fair value of a cryptocurrency can be predicted from its limit order book data and the corresponding market data (trades).

Hypotheses and assumptions¶

Historical limit order-book (LOB) probability distribution¶

The probability that a limit order at depth $D$ of the order book is executed is assumed to follow a Poisson distribution.
Under this distribution, the probability of a limit order at $D > 10$ being hit is negligible (p = 0.00045). We will therefore consider only the first 10 levels of the historical LOBs, which also has the benefit of speeding up the calculations.
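
As a quick sanity check of this assumption, the tail probability beyond level 10 can be computed from the Poisson survival function. The sketch below is illustrative only: the rate parameter lambda_depth is an assumed value, not the one underlying the figure quoted above.

from scipy.stats import poisson

# Assumed average depth at which incoming market orders execute (illustrative value)
lambda_depth = 3
# Probability that an order deeper than level 10 is hit, P(D > 10)
p_beyond_10 = poisson.sf(10, mu=lambda_depth)
print(f"P(execution at depth > 10) = {p_beyond_10:.5f}")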

Definition of fair value¶

The instantaneous fair value is assumed to be the arithmetic average of the bid and ask price at the top of the LOB (level 0), usually referred to as mid-price.
The predicted fair value is based on the same convention. Since the fair value is the variable we want to predict, it will equivalently be referred to as "target" in the rest of this document.

Frequency of historical data¶

The frequency of the historical data available is not constant, that is, the time delta between $LOB_i$ and $LOB_{i-1}$ varies.
Although it is possible to resample the historical data to obtain a constant time delta, doing so has two undesired effects:

  1. the dataset to analyse grows significantly in size, especially at short resampling intervals (which are desirable to preserve the information present in the original dataset)
  2. the resampled dataset contains many duplicate entries (the shorter the resampling interval, the more duplicates), which make the subsequent calculations numerically unstable

We also assume that real-time data will not arrive at constant intervals. For these reasons resampling is disabled.

Methodology¶

Fair value prediction¶

The target variable we will try to predict is the average percent change of the mid-price $N$ time steps ahead (with $N$ being configurable).
According to the available literature ([2], [3]), there might be a relationship, possibly linear, between the target (the future mid-price) and the independent variables (the prices and volumes at each level of the historical LOBs). We will therefore first search for a linear relationship between the target and the independent variables, and then explore possible non-linear relationships.
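
As a concrete illustration of the target definition, a minimal sketch is shown below. The column names (bid_price_0, ask_price_0) are hypothetical; in this notebook the target is actually built later from parsed_ob['mids'] and utils.calculate_pct_changes.

import pandas as pd

def make_target(df: pd.DataFrame, n_steps: int) -> pd.Series:
    # Mid-price at the top of the book (level 0)
    mid = (df['bid_price_0'] + df['ask_price_0']) / 2.
    # Percent change of the mid-price n_steps ahead, aligned to the current row
    return (mid.shift(-n_steps) / mid - 1.) * 100.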

Spread estimation¶

In order to provide quotes around the predicted fair value we need to estimate the spread. The bid and ask quotes are calculated as:

\begin{align} bid &= fair - spread/2 \tag{1} \\ ask &= fair + spread/2 \tag{2} \end{align}

The spread is calculated as suggested in Avellaneda and Stoikov [1]:

\begin{equation} spread = (T - t) \cdot \gamma \cdot \sigma^2 + \frac{2}{\gamma} \ln\left(1 + \frac{\gamma}{k}\right) \tag{3} \end{equation}
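
A direct implementation of equation (3) could look like the sketch below; the helper used later in this notebook (utils.calculate_spread) is assumed to follow the same formula, so this function is illustrative only. The sigma argument can be a scalar or a rolling volatility series, as in the Spread estimation section below.

import numpy as np

def avellaneda_stoikov_spread(t, T=1.0, gamma=0.1, sigma=0.0, k=1.5):
    # spread = (T - t) * gamma * sigma^2 + (2 / gamma) * ln(1 + gamma / k)
    return (T - t) * gamma * sigma**2 + np.log(1.0 + gamma / k) * 2.0 / gamma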

Input parameters, data, and dependencies ¶

Group all user input parameters

In [ ]:
## Maximum depth of the orderbook to consider. 
## Orders at levels greater than maxdepth will be discarded
## Use -1 to use all levels
maxdepth_parse=10
maxdepth_flatten=10

## Frequency to resample the dataframes (seconds).
## Use -1 to skip resampling
resample_freq = -1 # int(np.floor(pd.Series(ob.index).diff().describe()['25%'].total_seconds()))

## Weight the prices and volumes of each LOB level. See function calculate_weights()
orderbook_depth_weights = 'none'

## The quantity we want to predict
target_side = 'mids'

## Predict the target N steps ahead.
## If it is an array with more than one element, populate the dataframe with a column for each element
## The prediction is for the last element of the array
n_steps_future = [0,5]

## The last date of the dataset we want to analyse
## Leave empty to use the whole dataset
last_date = ''

## Merge trades with order books
merge_ob_trades = False

## Parameters for spread estimation
gamma=0.1
kappa=1.5
std_window = 1000

Load libraries

In [ ]:
import gc
import json
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=RuntimeWarning)

import numpy as np
import pandas as pd
pd.set_option('precision',12)
pd.options.plotting.backend = 'matplotlib'
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

import xgboost
from sklearn.utils import class_weight
from sklearn import model_selection
import sklearn.linear_model as linear_model
import sklearn.ensemble as ensemble
import sklearn.feature_selection as feature_selection
import sklearn.metrics as metrics
import sklearn.pipeline as pipeline
import sklearn.preprocessing as preprocessing
import statsmodels.api as sm

import dateutil
import importlib

import seaborn as sns
%config InlineBackend.figure_formats = ['png']
import matplotlib.pyplot as plt
ggplot_styles = {
    'axes.edgecolor': 'white',
    'axes.facecolor': 'white',
    'axes.grid': True,
    'axes.grid.which': 'both',
    'axes.spines.left': False,
    'axes.spines.right': False,
    'axes.spines.top': False,
    'axes.spines.bottom': False,
    'grid.color': 'lightgrey',
    'grid.linewidth': '0.5',
    'xtick.color': '555555',
    'xtick.major.bottom': True,
    'xtick.minor.bottom': False,
    'ytick.color': '555555',
    'ytick.major.left': True,
    'ytick.minor.left': False,
    'figure.figsize': [8,6]
}
plt.rcParams.update(ggplot_styles)

import utilities as utils

Load the data from the CSV files and serialize it so it can be retrieved more efficiently. This step is needed only once

In [ ]:
# utils.load_data()
In [ ]:
ob = pd.read_pickle('orderbook.bz2')
trades = pd.read_pickle('trades.bz2')

Remove duplicates

In [ ]:
if merge_ob_trades:
    ob = ob.drop_duplicates('lastUpdated')
    trades = trades.drop_duplicates('timestamp')

Make sure that dates are handled as such (datetime format in Pandas)

In [ ]:
trades['timestamp'] = pd.to_datetime(trades['timestamp'])
ob['lastUpdated'] = pd.to_datetime(ob['lastUpdated'])
trades = trades.set_index('timestamp')
ob = ob.set_index('lastUpdated')

Optionally limit the amount of data to process

In [ ]:
if last_date: ob = ob[ob.index < dateutil.parser.parse(last_date)]

Visual inspection of imported data

In [ ]:
ob
Out[ ]:
asks bids
lastUpdated
2020-07-17 00:00:23.225 [[2.644e-05, 5243], [2.645e-05, 4216], [2.646e... [[2.639e-05, 4216], [2.638e-05, 7000], [2.637e...
2020-07-17 00:00:30.293 [[2.645e-05, 4216], [2.646e-05, 763], [2.647e-... [[2.638e-05, 4989], [2.637e-05, 2427], [2.6360...
2020-07-17 00:00:30.429 [[2.645e-05, 4216], [2.646e-05, 763], [2.647e-... [[2.639e-05, 4216], [2.638e-05, 4989], [2.637e...
2020-07-17 00:01:04.450 [[2.644e-05, 11], [2.646e-05, 4978], [2.647e-0... [[2.639e-05, 4216], [2.638e-05, 6689], [2.637e...
2020-07-17 00:01:10.753 [[2.643e-05, 7], [2.645e-05, 3982], [2.646e-05... [[2.639e-05, 4216], [2.638e-05, 6689], [2.637e...
... ... ...
2020-07-30 23:58:32.329 [[2.7060000000000002e-05, 7396], [2.707e-05, 2... [[2.703e-05, 495], [2.7020000000000002e-05, 28...
2020-07-30 23:58:38.233 [[2.709e-05, 46], [2.7099999999999998e-05, 275... [[2.703e-05, 3251], [2.7020000000000002e-05, 1...
2020-07-30 23:58:44.335 [[2.707e-05, 7395], [2.7099999999999998e-05, 7... [[2.703e-05, 3251], [2.7020000000000002e-05, 1...
2020-07-30 23:58:44.438 [[2.7099999999999998e-05, 54], [2.711e-05, 255... [[2.703e-05, 3251], [2.7020000000000002e-05, 1...
2020-07-30 23:59:01.545 [[2.705e-05, 2758], [2.7060000000000002e-05, 6... [[2.703e-05, 131], [2.7020000000000002e-05, 10...

96858 rows × 2 columns

In [ ]:
trades
Out[ ]:
number id price side amount
timestamp
2020-07-17 00:01:10.647 0 19088200 0.00002643 buy 4.0
2020-07-17 00:01:26.820 1 19088201 0.00002640 sell 58.0
2020-07-17 00:01:26.825 2 19088202 0.00002640 sell 25.0
2020-07-17 00:01:26.841 3 19088203 0.00002640 sell 235.0
2020-07-17 00:01:26.842 4 19088204 0.00002640 sell 36.0
... ... ... ... ... ...
2020-07-30 23:54:05.293 96498 19187367 0.00002707 sell 181.0
2020-07-30 23:57:25.885 96499 19187368 0.00002709 buy 167.0
2020-07-30 23:58:00.690 96500 19187369 0.00002706 sell 21.0
2020-07-30 23:58:00.697 96501 19187370 0.00002705 sell 43.0
2020-07-30 23:58:36.580 96502 19187371 0.00002710 buy 1.0

96503 rows × 5 columns

In [ ]:
trades[['price','amount']].diff().describe()
Out[ ]:
price amount
count 96502.000000000000 96502.000000000000
mean 0.000000000007 -0.000031087439
std 0.000000030751 3320.324964702130
min -0.000000990000 -192662.000000000000
25% -0.000000010000 -261.750000000000
50% 0.000000000000 0.000000000000
75% 0.000000010000 236.000000000000
max 0.000000960000 192641.000000000000
In [ ]:
pd.Series(ob.index).diff().describe()
Out[ ]:
count                        96857
mean     0 days 00:00:12.487670689
std      0 days 00:00:23.259605382
min         0 days 00:00:00.012000
25%         0 days 00:00:00.700000
50%         0 days 00:00:07.103000
75%         0 days 00:00:13.707000
max         0 days 00:12:44.853000
Name: lastUpdated, dtype: object
In [ ]:
if merge_ob_trades:
    ob_backup = ob.copy()
    trades_backup = trades.copy()
    trades = trades.reindex(ob.index, method='bfill')
    ob['market_price'] = trades['price']
    ob['market_volume'] = trades['amount']
    ob['market_side'] = trades['side'].apply(lambda x: 1 if x=='buy' else 0)

Prediction of fair value ¶

Data transformation¶

Parse order books¶

Parse the order books so that each one is stored as a Pandas DataFrame and not as a string

In [ ]:
parsed_ob = utils.parse_all_books(ob, maxdepth_parse=maxdepth_parse)
del(ob); gc.collect()
Out[ ]:
0

Resample dataframes¶

Optionally resample the dataframes to a common frequency

In [ ]:
if resample_freq > 0:
    parsed_ob = parsed_ob.resample(f'{resample_freq}s').bfill()
    #trades = trades.resample(f'{resample_freq}s').bfill()
parsed_ob
Out[ ]:
asks bids
lastUpdated
2020-07-17 00:00:23.225 price volume 0 0.00002644 5243 1 ... price volume 0 0.00002639 4216 1 ...
2020-07-17 00:00:30.293 price volume 0 0.00002645 4216 1 ... price volume 0 0.00002638 4989 1 ...
2020-07-17 00:00:30.429 price volume 0 0.00002645 4216 1 ... price volume 0 0.00002639 4216 1 ...
2020-07-17 00:01:04.450 price volume 0 0.00002644 11 1 ... price volume 0 0.00002639 4216 1 ...
2020-07-17 00:01:10.753 price volume 0 0.00002643 7 1 ... price volume 0 0.00002639 4216 1 ...
... ... ...
2020-07-30 23:58:32.329 price volume 0 0.00002706 7396 1 ... price volume 0 0.00002703 495 1 ...
2020-07-30 23:58:38.233 price volume 0 0.00002709 46 1 ... price volume 0 0.00002703 3251 1 ...
2020-07-30 23:58:44.335 price volume 0 0.00002707 7395 1 ... price volume 0 0.00002703 3251 1 ...
2020-07-30 23:58:44.438 price volume 0 0.00002710 54 1 ... price volume 0 0.00002703 3251 1 ...
2020-07-30 23:59:01.545 price volume 0 0.00002705 2758 1 ... price volume 0 0.00002703 131 1 ...

96858 rows × 2 columns

Visually inspect the data¶

Market price vs time¶

In [ ]:
trades.plot(title='Market price', y='price');

Volume vs LOB depth¶

Calculate and plot the "average" order book

In [ ]:
avg_ob = parsed_ob.sum()/parsed_ob.count()
In [ ]:
side = 'bids'
book = avg_ob[side]

weights = utils.calculate_weights(book)
(book['volume']*weights).plot.bar(title='Side: bid', xlabel='LOB depth', ylabel='Volume');
In [ ]:
side = 'asks'
book = avg_ob[side]
(book['volume']*weights).plot.bar(title='Side: ask', xlabel='LOB depth', ylabel='Volume');
In [ ]:
((avg_ob['asks']-avg_ob['bids'])['volume']*weights).plot.bar(title='Ask minus Bid', xlabel='LOB depth', ylabel='Volume');

Feature selection¶

We now derive new features from the raw data, as they could uncover correlations or patterns that help forecast the fair value. The aim is to find a relationship linking the future fair value $N$ steps ahead to the historical LOB data available:

$fair_{t+N} = f(LOB_{t}, LOB_{t-1}, ...)$

Create new features for each order book¶

We now calculate:

  1. the mid-price and its percent changes (appropriately shifted) in order to be able to use them as target variable
  2. the ask/bid ratio for both price and volume
  3. the imbalance ratio for both price and volume. The imbalance ratio is defined as $(x_{ask} - x_{bid})/x_{mid}$
In [ ]:
parsed_ob['mids'] = (parsed_ob['asks'] + parsed_ob['bids'])/2.
parsed_ob = utils.calculate_pct_changes(parsed_ob, target_side, n_steps_future, keyword='_pct_')

spread = parsed_ob['asks'] - parsed_ob['bids']
parsed_ob['askbid_ratio'] = parsed_ob['asks']/parsed_ob['bids'] - 1
parsed_ob['imbalance_ratio'] = spread/parsed_ob['mids']

Calculate descriptive statistics for each feature¶

Calculate the weighted mean of price and volume across the levels of the LOB.

In [ ]:
for metric in ['askbid_ratio','imbalance_ratio']:
    for col in ['price','volume']:
        parsed_ob[f'{metric}_{col}_weighted_mean'] = parsed_ob[metric].apply( lambda book: 
            sum(book[col] * utils.calculate_weights(book,type=orderbook_depth_weights)) 
            / sum(utils.calculate_weights(book,type=orderbook_depth_weights)) )

Specify targets¶

Specify the targets of the analysis. If our hypothesis is correct we should find a linear relationship between the target and the (historical) features $f_i$:

$target_{t+N} = [c + a_1 \cdot f_1 + a_2 \cdot f_2 + ... + a_n \cdot f_n]_{t} + [c + a_1 \cdot f_1 + a_2 \cdot f_2 + ... + a_n \cdot f_n]_{t-1} + ...$

In [ ]:
targets = [t for t in parsed_ob.columns.values if '_pct_' in t and 'target' not in t]
for t in targets:
    parsed_ob['target_' + t] = parsed_ob[t]

Flatten order books¶

In order to carry out the analysis we need to "flatten" the order books so that each level of the order book is represented by a separate column. This way we can easily transform each feature to create new ones which might help to predict the target.
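
As an illustration, flattening a single side of one parsed book could look like the hypothetical helper below; the notebook itself uses utils.flatten_all_books, whose implementation is not shown here.

def flatten_book(book, side, maxdepth=10):
    # book is a DataFrame with 'price' and 'volume' columns indexed by LOB level
    flat = {}
    for level in range(min(maxdepth, len(book))):
        flat[f'{side}_price_level_{level}'] = book['price'].iloc[level]
        flat[f'{side}_volume_level_{level}'] = book['volume'].iloc[level]
    return flat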

In [ ]:
to_drop = targets + ['mids']
flat_parsed_ob = utils.flatten_all_books(parsed_ob, to_drop=to_drop, 
                                         maxdepth_flatten=maxdepth_flatten)
flat_parsed_ob
Flattening feature: asks
Flattening feature: bids
Flattening feature: askbid_ratio
Flattening feature: imbalance_ratio
Flattening feature: target_mids_pct_0
Flattening feature: target_mids_pct_5
Out[ ]:
lastUpdated askbid_ratio_price_weighted_mean askbid_ratio_volume_weighted_mean imbalance_ratio_price_weighted_mean imbalance_ratio_volume_weighted_mean asks_price_level_0 asks_price_level_1 asks_price_level_2 asks_price_level_3 asks_price_level_4 ... target_mids_pct_5_volume_level_0 target_mids_pct_5_volume_level_1 target_mids_pct_5_volume_level_2 target_mids_pct_5_volume_level_3 target_mids_pct_5_volume_level_4 target_mids_pct_5_volume_level_5 target_mids_pct_5_volume_level_6 target_mids_pct_5_volume_level_7 target_mids_pct_5_volume_level_8 target_mids_pct_5_volume_level_9
1 2020-07-17 00:00:30.293 0.006077951195 95.216377457823 0.006057164490 0.129558427753 0.00002645 0.00002646 0.00002647 0.00002648 0.00002649 ... -56.023900054318 900.564263322884 -67.376214715410 -78.312692598407 214.674130347861 -14.386775427705 3729.701230228471 -96.295870240423 -43.095939541437 332.422171602126
2 2020-07-17 00:00:30.429 0.005696064107 12.132931502401 0.005677517033 0.060698331709 0.00002645 0.00002646 0.00002647 0.00002648 0.00002649 ... -49.928842504744 39.481919332406 329.447446785045 -96.502218159930 1397.647058823530 -30.922277132405 1857.344064386318 -0.412532054856 855.312157721796 -93.554220381497
3 2020-07-17 00:01:04.450 0.005658170966 4.807230932065 0.005639702742 -0.269652793336 0.00002644 0.00002646 0.00002647 0.00002648 0.00002649 ... -69.245327655548 32.827633496186 44.568844870277 -20.173162537896 -49.673763736264 -48.751684744240 1746.981891348089 -26.446649570744 23.254160147916 -90.799882801055
4 2020-07-17 00:01:10.753 0.005278591847 4.644906072570 0.005262190319 -0.110178755809 0.00002643 0.00002645 0.00002646 0.00002647 0.00002648 ... -0.047359696898 5.725798894199 81.350438892640 0.000000000000 -67.269681742044 -30.181590482154 49.924204143507 1051.308900523560 -60.211557834967 -86.477174849268
5 2020-07-17 00:01:18.858 0.004897152943 2.013669986961 0.004882686142 -0.372028741073 0.00002643 0.00002645 0.00002646 0.00002647 0.00002648 ... 7.180570221753 63.979019272993 -29.339504056679 799.381387842926 -90.418124162281 -98.437500000000 33.296964251332 177.090909090909 -42.061745682470 -90.138190954774
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
96848 2020-07-30 23:58:25.227 0.007229176642 13.649452908530 0.007199312138 -0.418270221678 0.00002710 0.00002712 0.00002713 0.00002717 0.00002718 ... -8.083867210250 -25.700687466949 129.524232992441 1216.624040920716 -76.126404147636 1161.637931034483 -83.653673432729 742.366554684584 -7.888446215139 -46.207515199841
96849 2020-07-30 23:58:25.328 0.006228613124 10.686884864106 0.006205477910 -0.138614138212 0.00002709 0.00002710 0.00002711 0.00002712 0.00002713 ... 13.886010362694 -66.522349637257 74.476722060129 -6.863442389758 -17.818930041152 0.000000000000 48.823836905384 0.000000000000 0.000000000000 -13.530349123569
96850 2020-07-30 23:58:25.428 0.005709750993 4.638994660619 0.005689828689 0.265893564738 0.00002708 0.00002709 0.00002710 0.00002711 0.00002712 ... 766.938110749186 168.767507002801 -30.515887183149 28.866655732327 -48.277648277648 -79.576008273009 42.849974912193 -3.040293040293 35.144237160456 -72.546223898445
96851 2020-07-30 23:58:26.225 0.006673414751 10.621061081675 0.006647136258 -0.156175782429 0.00002709 0.00002710 0.00002712 0.00002713 0.00002717 ... 14.162348877375 -68.921132693658 32.214830757190 -85.213864306785 0.000000000000 0.000000000000 51.038251366120 0.000000000000 -30.709323421390 -13.548012764723
96852 2020-07-30 23:58:31.628 0.006599422890 0.549903308992 0.006573281075 -0.504127771190 0.00002707 0.00002710 0.00002712 0.00002713 0.00002717 ... -11.162361623616 -65.487009515663 -4.101969872538 94.109904573873 -60.640508495294 4795.555555555556 5.101302460203 -10.931613197854 -63.781583255896 -7.248322147651

96852 rows × 125 columns

In [ ]:
flat_parsed_ob_backup = flat_parsed_ob.copy()
del(parsed_ob); gc.collect()
Out[ ]:
0
In [ ]:
flat_parsed_ob = flat_parsed_ob_backup.copy()

Create additional features¶

The following snippet creates new features from the original ones. The target might be related to one of the transformed features, and this could help improve the predictive power of the model we are going to fit later on.

  • Search for top of the book changes. If the top of the book bid (ask) prices have decreased (increased) an order might have been filled.
  • Calculate rolling statistics for each feature, specifically rolling mean and max/min ratio
  • Create new features by calculating the percentage change of the original features
  • Add lagged features
  • Add features powers
  • Add features logarithms
  • Add binarized features
In [ ]:
keywords = ['lastUpdated','target','rolling','lag','pctchange','traded','pow','loge','log10','bin']
flat_parsed_ob = utils.feature_engineering(flat_parsed_ob, keywords)
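
As an illustration of a few of the transformations listed above (lags, percent changes, rolling statistics), the hypothetical helper below operates on a single column; the suffixes mirror the column names visible in the flattened dataframe, but the actual logic lives in utils.feature_engineering.

def add_basic_features(df, col, lags=(1, 5), pct_windows=(1, 5), rolling_window=5):
    for n in lags:
        df[f'{col}_lag_{n}'] = df[col].shift(n)                     # lagged feature
    for n in pct_windows:
        df[f'{col}_pctchange_{n}'] = df[col].pct_change(n) * 100.   # percent change
    df[f'{col}_rolling_{rolling_window}_mean'] = df[col].rolling(rolling_window).mean()
    return df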

Clean data and check if there are any nulls left

In [ ]:
flat_parsed_ob = flat_parsed_ob.replace([np.inf, -np.inf], np.nan)
flat_parsed_ob = flat_parsed_ob.copy().dropna()
flat_parsed_ob.isnull().sum().sum()
Out[ ]:
0

Check types of columns to make sure subsequent calculations run efficiently

In [ ]:
flat_parsed_ob.dtypes.value_counts()
Out[ ]:
float64           1066
int64               42
datetime64[ns]       1
dtype: int64

Data analysis¶

We start by analysing the whole historical LOB dataset to get an understanding of the properties of the independent variables, in particular those that we expect to help predict the future fair value.

Analysing and fitting the entire dataset, which spans several trading days, might not provide significant insights into the actual forecasting potential of the historical data. Using two-week-old data to train a model intended to make predictions for the next hour will likely lead to poor results. There are in fact thousands of data rows for each trading day, each containing 400 data points (100 ask prices, 100 bid prices, 100 ask volumes, 100 bid volumes). It therefore makes sense to use only data that is, for example, at most one day old. Older data is likely no longer relevant, and therefore not useful for fitting a model aimed at predicting the fair price in the next 30 seconds.

For this reason the analysis of the whole dataset is only broadly indicative. We will later perform a rolling cross-validation of the models to get an understanding of their possible predictive power in a scenario similar to the one in which we would actually use them.

Stationarity and autocorrelation checks¶

The volume imbalance ratio has positive autocorrelation as shown in the charts below. The partial autocorrelation is significant approximately up to lag 15. This indicates that a positive (negative) imbalance is often followed by periods of persistent positive (negative) imbalance. The imbalance effect is therefore not a transient one, and could help in predicting the target.

In [ ]:
results = sm.tsa.stattools.adfuller(flat_parsed_ob['imbalance_ratio_volume_weighted_mean'])
message = "The series is non-stationary"
if results[1] < 0.05:
    for key, value in results[4].items():
        if abs(results[0]) < abs(value):
            message = f'The series is stationary at a significance level of {key}'
            break
print(message)
The series is non-stationary
In [ ]:
results = sm.tsa.stattools.kpss(flat_parsed_ob['imbalance_ratio_volume_weighted_mean'], nlags="auto")
message = "The series is non-stationary"
if results[1] < 0.05:
    for key, value in results[3].items():
        if abs(results[0]) < abs(value):
            message = f'The series is stationary at a significance level of {key}'
            break
print(message)
The series is stationary at a significance level of 2.5%
In [ ]:
_ = sm.graphics.tsa.plot_acf(flat_parsed_ob['imbalance_ratio_volume_weighted_mean'], lags=40)
In [ ]:
_ = sm.graphics.tsa.plot_pacf(flat_parsed_ob['imbalance_ratio_volume_weighted_mean'], lags=40)

Plot target vs features¶

Visually inspect the data to possibly find a relationship between the target and some features.

Target change vs Volume imbalance¶

The graph immediately below shows a weak relationship between the volume imbalance ratio and the instantaneous price change of the target variable. This relationship seems to get slightly stronger for longer prediction windows, as shown in the second chart: future top-of-the-book prices are more likely to be lower when the current ask volume exceeds the bid volume.

In [ ]:
_ = sns.regplot(x='imbalance_ratio_volume_weighted_mean', 
                y=f'target_{target_side}_pct_0_price_level_0',
                data=flat_parsed_ob, line_kws=dict(color='red'));
_.set_xlabel('Volume imbalance ratio (LOB mean)');
_.set_ylabel('Percent price change (one step ahead)');
In [ ]:
_ = sns.regplot(x='imbalance_ratio_volume_weighted_mean_rolling_5_mean', 
                y=f'target_{target_side}_pct_{n_steps_future[-1]}_price_level_0',
                data=flat_parsed_ob, line_kws=dict(color='red'));
_.set_xlabel('Volume imbalance ratio (LOB mean), 5 steps rolling average');
_.set_ylabel(f'Percent price change ({n_steps_future[-1]} step ahead)');

Target change vs Ask bid ratio¶

The chart below shows that the future mid-price change is on average lower when ask volumes are significantly higher than bid volumes.

In [ ]:
_ = sns.regplot(x='askbid_ratio_volume_weighted_mean', 
                y=f'target_{target_side}_pct_5_price_level_0',
                data=flat_parsed_ob, line_kws=dict(color='red'));
_.set_xlabel('Volume ask/bid ratio (LOB mean)');
_.set_ylabel(f'Percent price change ({n_steps_future[-1]} step ahead)');

Spread vs Volume imbalance¶

The chart below shows that for higher values of the bid-ask spread both the volume imbalance ratio and the liquidity are on average smaller (note the fewer data points at higher spreads).

In [ ]:
flat_parsed_ob.plot.scatter(x='imbalance_ratio_volume_weighted_mean', y='imbalance_ratio_price_level_0', 
                            xlabel='Volume imbalance ratio (LOB mean)', ylabel='Bid-ask spread as percentage of mid-price');

Model selection¶

Investigate which models could possibly describe the relationship between the current order book and the future top of the book mid-price.

Prepare data for the analysis¶

Split the data into training and test sets.

In [ ]:
X_train, X_test, y_train, y_test = utils.prepare_data(flat_parsed_ob, 
                                                      targets=[f'target_{target_side}_pct_{n_steps_future[-1]}_price_level_0'],
                                                      filter_fun=None)

Linear regression¶

Check if there are any features that have a linear relationship with the target.

In [ ]:
model = sm.OLS(y_train, X_train)
results = model.fit()
results_summary = results.summary()

results_as_html = results_summary.tables[0].as_html()
pd.read_html(results_as_html, header=0, index_col=0)[0]
Out[ ]:
Dep. Variable: y R-squared: 0.132
Model: OLS Adj. R-squared: 0.123
Method: Least Squares F-statistic: 15.820
Date: Sat, 09 Apr 2022 Prob (F-statistic): 0.000
Time: 17:49:00 Log-Likelihood: 52052.000
No. Observations: 59949 AIC: -103000.000
Df Residuals: 59379 BIC: -97830.000
Df Model: 569 NaN NaN
Covariance Type: nonrobust NaN NaN
In [ ]:
results_as_html = results_summary.tables[1].as_html()
pd.read_html(results_as_html, header=0, index_col=0)[0].sort_values(by='P>|t|')
Out[ ]:
coef std err t P>|t| [0.025 0.975]
const 48.8247 7.321 6.669 0.000 34.475 63.175
imbalance_ratio_volume_level_8_lag_8 0.0045 0.001 5.602 0.000 0.003 0.006
imbalance_ratio_volume_level_8_lag_9 0.0044 0.001 5.533 0.000 0.003 0.006
askbid_ratio_price_level_7_pctchange_61 161.5078 39.609 4.078 0.000 83.875 239.141
imbalance_ratio_volume_level_9_lag_1 0.0218 0.004 5.854 0.000 0.014 0.029
... ... ... ... ... ... ...
askbid_ratio_price_level_2_pctchange_201 0.1747 13.462 0.013 0.990 -26.212 26.561
imbalance_ratio_price_level_5_lag_4 0.9421 83.598 0.011 0.991 -162.909 164.794
imbalance_ratio_price_level_6_lag_3 -0.3350 91.093 -0.004 0.997 -178.877 178.207
askbid_ratio_price_level_3_pctchange_81 -0.0549 19.018 -0.003 0.998 -37.330 37.220
imbalance_ratio_price_level_3_pctchange_81 0.0265 19.064 0.001 0.999 -37.339 37.392

1069 rows × 6 columns

Residual Analysis¶

The residuals should ideally:

  1. have zero mean
  2. be Normally distributed
  3. exhibit no autocorrelation

While condition #1 is approximately satisfied, this is not the case for #2 and #3.

Both the Kolmogorov-Smirnov test and the QQ plot show that condition #2 is not satisfied. The residuals are nonetheless approximately distributed symmetrically around the mean.

The Durbin-Watson test, as well as the ACF and PACF charts, show that the residuals are positively autocorrelated: condition #3 is not satisfied.

In [ ]:
predictions = results.predict(X_test)
prediction_error = y_test - predictions
pd.DataFrame(prediction_error.describe())
Out[ ]:
0
count 23194.000000000000
mean 0.002352179778
std 0.219006116152
min -11.074054726947
25% -0.067882855451
50% 0.000787590469
75% 0.069043484067
max 3.727768332582
In [ ]:
_, pvalue = sm.stats.diagnostic.kstest_normal(prediction_error)
if pvalue < 0.05: print("The residuals are not Normally distributed; p-value: " + str(pvalue))
The residuals are not Normally distributed; p-value: 0.0009999999999998899
In [ ]:
dw = sm.stats.stattools.durbin_watson(prediction_error, axis=0)
if dw < 1: print("The residuals are positively autocorrelated. Durbin-Watson statistic: " + str(dw))
if dw > 3: print("The residuals are negatively autocorrelated. Durbin-Watson statistic: " + str(dw))
The residuals are positively autocorrelated. Durbin-Watson statistic: 0.571941507029963
In [ ]:
prediction_error.plot(kind='hist',bins=1000, title="Prediction error distribution");
In [ ]:
_ = sm.qqplot(prediction_error, marker='.');
In [ ]:
x = range(0,len(y_test))
fig, ax = plt.subplots()
ax.plot(x,y_test,label='actual');
ax.plot(x,predictions,c='r',label='predicted');
legend = ax.legend(loc='upper left')
plt.show();
In [ ]:
_ = sm.graphics.tsa.plot_acf(prediction_error, lags=40)
In [ ]:
_ = sm.graphics.tsa.plot_pacf(prediction_error, lags=40)

Rolling cross validation¶

The cross validation setup and the results using a linear regression model are shown below. The cross validation is configured so that the test dataset (orange) is always about $1/5$ of the training dataset (blue). The R-squared and MSE are evaluated for the train set and the test set. The descriptive statistics are shown in the table below, while the rolling R-squared for both the train set and the test set are shown in the subsequent figure.

The test-set R-squared values obtained from the cross-validation process seem to confirm that the predicted values are positively related to the actual values. This holds for most of the days in the historical dataset.

In [ ]:
X_train, X_test, y_train, y_test = utils.prepare_data(flat_parsed_ob, 
                                                      targets=[f'target_{target_side}_pct_{n_steps_future[-1]}_price_level_0'], 
                                                      filter_fun=None, test_size=0.)
n_days = flat_parsed_ob["lastUpdated"].map(pd.Timestamp.date).unique().size
max_train_size = int(np.floor(len(X_train)/n_days))
test_size = int(max_train_size/5)
n_splits = int(len(X_train)/test_size)-1
kfold = model_selection.TimeSeriesSplit(n_splits=n_splits,
                                        test_size=test_size,
                                        max_train_size=max_train_size
                                        )
fig, ax = plt.subplots()
utils.plot_cv_indices(kfold, X_train, y_train, ax);
In [ ]:
X_train, X_test, y_train, y_test = utils.prepare_data(flat_parsed_ob, 
                                                      targets=[f'target_{target_side}_pct_{n_steps_future[-1]}_price_level_0'], 
                                                      filter_fun=None, test_size=0.)
scoring = ['r2', 'neg_mean_squared_error']
n_days = flat_parsed_ob["lastUpdated"].map(pd.Timestamp.date).unique().size
scaler = preprocessing.RobustScaler()#MinMaxScaler()
selector = feature_selection.SelectKBest(feature_selection.f_regression, k=20)
models = [('Ridge', pipeline.make_pipeline(scaler, selector, linear_model.Ridge(fit_intercept=False)))]
cv_results, fitted = utils.test_models(X_train, y_train, X_test, y_test, mode='cv',
                                       models=models, scoring=scoring, kfold=kfold)
cv_results.describe()
Fitting model: Ridge
Out[ ]:
fit_time score_time test_r2 train_r2 test_neg_mean_squared_error train_neg_mean_squared_error
count 69.000000000000 69.000000000000 69.000000000000 69.000000000000 69.000000000000 69.000000000000
mean 0.333873848984 0.012919432875 0.028822254092 0.070229812273 -0.016471403460 -0.015131589037
std 0.018144384567 0.001662836451 0.059078263947 0.034619643888 0.023563365223 0.013993361944
min 0.223081588745 0.012141942978 -0.131451745510 0.031828395760 -0.175018441188 -0.052642739588
25% 0.333499431610 0.012435436249 -0.006585886034 0.051146566804 -0.019409458100 -0.022903501645
50% 0.335736989975 0.012549638748 0.027211790901 0.057035963721 -0.008815412852 -0.008847181340
75% 0.339479684830 0.012677192688 0.061877578638 0.067794161364 -0.004129877889 -0.004584193482
max 0.360774993896 0.025242328644 0.183978750468 0.177001109424 -0.002426292679 -0.002778878809
In [ ]:
for m in set(cv_results['model']):
    cv_results[cv_results['model']==m][['test_r2','train_r2']].plot(title='Model: ' + m);

Accuracy of the prediction of the linear regression¶

To get a more concrete understanding of the actual capability of the linear regression model to predict future values, we calculate an accuracy score for the linear regression as follows: if the actual value and the predicted value have the same sign the prediction is counted as correct, and incorrect otherwise. The number of correct predictions as a percentage of the total is then used as the score. A score of 0.5 means that the prediction is no better than random, while a score above 0.5 indicates some predictive power.
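
The scorer referenced in the cell below (utils.binarize) is not listed in this document; a minimal sketch consistent with the description above could be:

import numpy as np

def sign_accuracy(y_true, y_pred):
    # Fraction of predictions whose sign agrees with the sign of the actual value
    return np.mean(np.sign(y_true) == np.sign(y_pred))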

As shown in the table and in the figure below, the accuracy is greater than 0.5 for most of the days in the historical dataset.

In [ ]:
X_train, X_test, y_train, y_test = utils.prepare_data(flat_parsed_ob, 
                                                      targets=[f'target_{target_side}_pct_{n_steps_future[-1]}_price_level_0'],
                                                      filter_fun=None, test_size=0)
scoring = metrics.make_scorer(utils.binarize, greater_is_better=True)
n_days = flat_parsed_ob["lastUpdated"].map(pd.Timestamp.date).unique().size
scaler = preprocessing.RobustScaler()
selector = feature_selection.SelectKBest(feature_selection.f_regression, k=30)
models = [('Ridge', pipeline.make_pipeline(scaler, selector, linear_model.Ridge(fit_intercept=False)))]
cv_results, fitted = utils.test_models(X_train, y_train, X_test, y_test, models=models, scoring=scoring, kfold=kfold)
cv_results.describe()
Fitting model: Ridge
Out[ ]:
fit_time score_time test_score train_score
count 69.000000000000 69.000000000000 69.000000000000 69.000000000000
mean 0.335588002550 0.013615846634 0.577432354634 0.591096794227
std 0.020117858132 0.002543659147 0.045609588068 0.028600052101
min 0.216948032379 0.010088682175 0.464594127807 0.512001381454
25% 0.333982229233 0.012809753418 0.548359240069 0.579174581247
50% 0.338042259216 0.012974500656 0.577720207254 0.590398894837
75% 0.343518018723 0.013219118118 0.603626943005 0.607839751338
max 0.360073804855 0.028847694397 0.674438687392 0.657226731135
In [ ]:
for m in set(cv_results['model']):
    cv_results[cv_results['model']==m][['test_score','train_score']].plot(title='Model: ' + m);

Testing of non-linear models¶

We now perform a rolling cross-validation of other non-linear models. Before fitting a model we first scale the data, and then discard the features which are weakly correlated with the target according to the p-values provided by a linear regression. We then perform the rolling cross validation using only the remaining features.

As shown in the table below, the non-linear models perform worse than the linear regression. This suggests that the relationship between the independent variables and the target variable is predominantly linear. The non-linear models are likely overfitting the past data (high R-squared on the train data), and therefore perform poorly on unseen data (low R-squared on the test data).

In [ ]:
X_train, X_test, y_train, y_test = utils.prepare_data(flat_parsed_ob, 
                                                      targets=[f'target_{target_side}_pct_{n_steps_future[-1]}_price_level_0'],
                                                      filter_fun=None, test_size=0)
scoring = ['r2', 'neg_mean_squared_error']
n_days = flat_parsed_ob["lastUpdated"].map(pd.Timestamp.date).unique().size
scaler = preprocessing.MaxAbsScaler()
selector = feature_selection.SelectKBest(feature_selection.f_regression, k=30)
models = [
       ('RF',   pipeline.make_pipeline(scaler, selector, ensemble.RandomForestRegressor(n_jobs=-1))),
       ('GBR', pipeline.make_pipeline(scaler, selector, ensemble.GradientBoostingRegressor())),
       ('XGB', pipeline.make_pipeline(scaler, selector, xgboost.XGBRegressor()))
    ]
cv_results, fitted = utils.test_models(X_train, y_train, X_test, y_test, models=models, scoring=scoring, kfold=kfold)
cv_results.groupby('model').agg([np.mean, np.std]).drop(['fit_time', 'score_time'],axis=1)
Fitting model: RF
Fitting model: GBR
Fitting model: XGB
Out[ ]:
test_r2 train_r2 test_neg_mean_squared_error train_neg_mean_squared_error
mean std mean std mean std mean std
model
GBR -0.035045437279 0.126602822863 0.275395881971 0.095452032435 -0.017260935830 0.023765452648 -0.011328787795 0.010029123550
RF -0.081334531733 0.161601914501 0.860985559525 0.097174713448 -0.017709898251 0.023942569538 -0.002087620088 0.001902349882
XGB -0.228866571154 0.239549250860 0.810262931564 0.122869576106 -0.019637913009 0.024934347772 -0.002763408650 0.002724918205
In [ ]:
for m in set(cv_results['model']):
    cv_results[cv_results['model']==m][['test_r2','train_r2']].plot(title='Model: ' + m);

Logistic regression as benchmark¶

Logistic regression is not suitable for our end goal, as we need to predict a continuous value, not a discrete one. We can nonetheless run a rolling cross-validation using a logistic regression and compare its results to the accuracy of the linear regression's predictions. To do so we transform the target variable into a discrete one by keeping only its sign: positive values are mapped to +1 and negative values to -1.
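
A minimal sketch of this discretisation is shown below; the actual transformation used in the next cell is utils.discretify, whose implementation is not reproduced here, and the treatment of zero changes is an assumption.

import numpy as np
import pandas as pd

def to_sign(y: pd.Series) -> pd.Series:
    # Map the continuous target to +1 (positive change) or -1 (non-positive change).
    # Mapping zeros to -1 is an assumption; the text does not specify their treatment.
    return pd.Series(np.where(y > 0, 1, -1), index=y.index)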

As shown in the table below, the accuracy of the prediction of the logistic regression is comparable to the accuracy of the "binarized" linear regression. This increases our confidence in the validity of the results of the cross validation of the linear regression model.

In [ ]:
X_train, X_test, y_train, y_test = utils.prepare_data(flat_parsed_ob, 
                                                      targets=[f'target_{target_side}_pct_{n_steps_future[-1]}_price_level_0'],
                                                      filter_fun=utils.discretify, add_constant=False, test_size=0)
scoring = ['accuracy']
scaler = preprocessing.MaxAbsScaler()
selector = feature_selection.SelectKBest(feature_selection.f_classif, k=30)
models = [('LogReg', pipeline.make_pipeline(scaler, selector, linear_model.LogisticRegression(max_iter=1000)))]
cv_results, fitted = utils.test_models(X_train, y_train, X_test, y_test, 
                                       type='classification', models=models,
                                       scoring=scoring, kfold=kfold)
cv_results.describe()
Filtering target
Fitting model: LogReg
Out[ ]:
fit_time score_time test_accuracy train_accuracy
count 69.000000000000 69.000000000000 69.000000000000 69.000000000000
mean 0.169621491778 0.011826076369 0.589784986609 0.607050206462
std 0.027328752996 0.000737363203 0.034070475403 0.019191981563
min 0.047887563705 0.011236190796 0.504317789292 0.574339492316
25% 0.153435707092 0.011511325836 0.570811744387 0.594178082192
50% 0.170368194580 0.011587858200 0.589810017271 0.604904161630
75% 0.187912702560 0.011782407761 0.608808290155 0.615783111725
max 0.226779699326 0.015849828720 0.671848013817 0.666551545502
In [ ]:
for m in set(cv_results['model']):
    cv_results[cv_results['model']==m][['test_accuracy','train_accuracy']].plot(title='Model: ' + m);

Spread estimation ¶

The spread is estimated as suggested in Avellaneda and Stoikov [1].

The charts below clearly show that the spread increases with market volatility. The remaining parameters ($\gamma, \kappa$) can be used to tune the amount of risk we are willing to take. The charts below are produced using the parameter values suggested in the original paper.

In [ ]:
std_window=200
prices = trades.drop_duplicates()['price']
std = prices.pct_change().rolling(std_window).std().fillna(0)
prices = prices[std>0]
std = std[std>0]
std.plot(title="Standard deviation");
In [ ]:
spread = utils.calculate_spread(0, T=1, gamma=gamma, sigma=std, k=kappa)
spread.plot(title="Spread");
In [ ]:
spread = ((spread - spread.min())/(spread.max()-spread.min())).fillna(0)
spread = spread*prices
spread.plot(title="Scaled spread");

Visual inspection of the market, bid, and ask prices¶

In [ ]:
ask = prices + spread/2
bid = prices - spread/2
fig, ax = plt.subplots()
ax.plot(prices,linestyle=':',color='lightgrey',label='price');
ax.plot(ask,':r',label='ask');
ax.plot(bid,':g',label='bid');
legend = ax.legend(loc='upper left')
plt.show();
In [ ]:
fig, ax = plt.subplots()
ax.plot(prices[-1000:],linestyle=':',color='lightgrey', label='price');
ax.plot(ask[-1000:],'-r',label='ask');
ax.plot(bid[-1000:],'-g',label='bid');
legend = ax.legend(loc='upper left')
plt.show();

Conclusions ¶

The analysis has confirmed our initial hypothesis of a linear relationship between the target variable (fair price $N$ steps ahead) and the independent variables (historical prices and volumes in the first 10 levels of the order book). The results of the cross validation indicate that such a relationship could be exploited effectively to forecast the fair price and to provide quotes around it.

An example implementation of the entire predictive algorithm described in this document is provided in the module quoter.py attached to this document. The Quoter class contained in the module requires the historical LOB and trades data to fit the model. Once the Quoter is initialized, it is ready to predict future values in an "online" fashion. Since the predictive model is based on lagged features as well as on features' percent changes, the prediction requires a number of past LOBs at least equal to the maximum lag/percent change used when training the model.

Further work¶

Although the results of the analysis are encouraging, additional work is necessary before considering deploying the predictive algorithm to production:

  1. A back-test should be performed to simulate the hypothetical P&L of a strategy using the predictive values calculated as described in this document
  2. Additional analyses of the outliers should be performed before fitting the model
  3. Additional analyses of the residuals should be performed to possibly address their positive autocorrelation and non-Normality
  4. Trades data should be used as additional features
  5. A rolling analysis of the residuals of the linear regression should be performed
  6. Non-linear models could be further investigated using a grid search (with cross validation) to understand if the overfitting problem highlighted by the results in this document could be mitigated
  7. A dimensionality analysis (for example PCA) could be performed to understand if the number of independent variables could be reduced further
  8. A grid search could be performed to possibly find the best forecast window (i.e., the optimal $N$ steps ahead) to employ in production
  9. The spread estimation model parameters should be calibrated according to both the desired risk level and the historical data
  10. The spread estimation model might be improved by using a volatility model (for example EWMA, GARCH, EGARCH) instead of the historical volatility
  11. All the functions used to transform and manipulate the data should be optimized to minimize execution time
  12. Appropriate logging should be put in place to catch unexpected behaviour

References ¶

  1. Avellaneda M., Stoikov S. High frequency trading in a limit order book. Quantitative Finance, Vol. 8, No. 3, April 2008
  2. Chordia T., Roll R., Subrahmanyam A. Order Imbalance, Liquidity, and Market Returns. Journal of Financial Economics, Vol. 64, no. 1, 2002
  3. Cartea A., Donnelly R., Jaimungal S. Enhancing Trading Strategies with Order Book Signals. SSRN, October 2015