Feature Engineering for Algorithmic Trading
Features are the fuel for machine learning models. In trading, good features can mean the difference between a profitable strategy and random noise.
Categories of Features
- Price-based features - derived from OHLCV
- Technical indicators - classical TA
- Microstructure features - order book, trades
- Alternative data - sentiment, fundamentals
- Time features - seasonality, calendar effects
Price-Based Features
Returns
Volatility
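Historical volatility (realized): $\sigma_n = \sqrt{\frac{1}{n-1}\sum_{i=1}^{n}\left(r_i - \bar{r}\right)^2}$, the sample standard deviation of the last $n$ returns $r_i$.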
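Parkinson volatility (using high-low): $\sigma_P = \sqrt{\frac{1}{4\ln 2}\cdot\frac{1}{n}\sum_{i=1}^{n}\left(\ln\frac{H_i}{L_i}\right)^2}$, where $H_i$ and $L_i$ are the high and low of bar $i$.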
1import pandas as pd
2import numpy as np
3
def calculate_returns(df, periods=(5, 10, 20)):
    """Add simple, log and multi-period return columns to ``df``.

    Args:
        df: DataFrame with a ``close`` column. It is modified in place.
        periods: Look-back lengths (in rows) for the additional
            ``returns_{p}d`` columns.

    Returns:
        The same ``df`` object with ``returns``, ``log_returns`` and one
        ``returns_{p}d`` column per entry of ``periods``.
    """
    close = df['close']

    # fill_method=None: never forward-fill a missing close before taking the
    # change. The implicit padding is deprecated in pandas and would make
    # ``returns`` disagree with ``log_returns`` around gaps.
    df['returns'] = close.pct_change(fill_method=None)
    df['log_returns'] = np.log(close / close.shift(1))

    # Multi-period simple returns
    for period in periods:
        df[f'returns_{period}d'] = close.pct_change(period, fill_method=None)

    return df
14
15
def calculate_volatility(df, windows=(5, 10, 20, 60)):
    """Add rolling close-to-close and Parkinson volatility columns to ``df``.

    Args:
        df: DataFrame with ``high`` and ``low`` columns and either a
            ``returns`` column or a ``close`` column to derive it from.
            It is modified in place.
        windows: Rolling window lengths in rows.

    Returns:
        The same ``df`` object with ``volatility_{w}d`` and
        ``parkinson_vol_{w}d`` columns for every ``w`` in ``windows``.
    """
    # Use the precomputed returns when present; otherwise derive them locally
    # so the function does not depend on call order.
    if 'returns' in df.columns:
        returns = df['returns']
    else:
        returns = df['close'].pct_change(fill_method=None)

    # Squared log high/low range is window-independent, so compute it once.
    log_range_sq = np.log(df['high'] / df['low']) ** 2
    parkinson_scale = 1 / (4 * np.log(2))

    for window in windows:
        # Standard deviation of returns
        df[f'volatility_{window}d'] = returns.rolling(window).std()

        # Parkinson volatility
        df[f'parkinson_vol_{window}d'] = np.sqrt(
            parkinson_scale * log_range_sq.rolling(window).mean()
        )

    return df


# Technical Indicators
Moving Averages
Simple Moving Average: $SMA_n(t) = \frac{1}{n}\sum_{i=0}^{n-1} P_{t-i}$
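Exponential Moving Average: $EMA_t = \alpha P_t + (1-\alpha)\,EMA_{t-1}$
Where $\alpha = \frac{2}{n+1}$ is the smoothing factor for a span of $n$ periods.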
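RSI (Relative Strength Index)
$RSI = 100 - \frac{100}{1 + RS}$
Where $RS = \frac{\text{average gain}}{\text{average loss}}$ over the look-back window (14 periods in the code below).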
MACD
$MACD = EMA_{12} - EMA_{26}$, with signal line $EMA_9(MACD)$ and histogram $MACD - \text{signal}$.
def add_technical_indicators(df):
    """Add common technical indicators to ``df``.

    Columns written: ``sma_{w}``, ``ema_{w}`` and ``close_to_sma_{w}`` for
    w in 10/20/50/200, ``rsi_14``, ``macd``, ``macd_signal``, ``macd_hist``,
    ``bb_mid``, ``bb_std``, ``bb_upper``, ``bb_lower``, ``bb_position`` and
    ``atr_14``.

    Args:
        df: DataFrame with ``close``, ``high`` and ``low`` columns. It is
            modified in place.

    Returns:
        The same ``df`` object with the indicator columns added.
    """
    close = df['close']

    # Moving averages and the close's relative distance from each SMA
    for window in [10, 20, 50, 200]:
        sma = close.rolling(window).mean()
        df[f'sma_{window}'] = sma
        df[f'ema_{window}'] = close.ewm(span=window).mean()
        df[f'close_to_sma_{window}'] = close / sma - 1

    # RSI (simple-average variant). clip() keeps the undefined first diff as
    # NaN instead of counting it as a zero move, so the first RSI value is
    # based on 14 real price changes.
    delta = close.diff()
    avg_gain = delta.clip(lower=0).rolling(14).mean()
    avg_loss = (-delta).clip(lower=0).rolling(14).mean()
    # Algebraically equal to 100 - 100 / (1 + avg_gain / avg_loss) but without
    # the infinite intermediate when avg_loss is zero.
    df['rsi_14'] = 100 * avg_gain / (avg_gain + avg_loss)

    # MACD
    ema_12 = close.ewm(span=12).mean()
    ema_26 = close.ewm(span=26).mean()
    df['macd'] = ema_12 - ema_26
    df['macd_signal'] = df['macd'].ewm(span=9).mean()
    df['macd_hist'] = df['macd'] - df['macd_signal']

    # Bollinger Bands (20 periods, 2 standard deviations)
    df['bb_mid'] = close.rolling(20).mean()
    df['bb_std'] = close.rolling(20).std()
    df['bb_upper'] = df['bb_mid'] + 2 * df['bb_std']
    df['bb_lower'] = df['bb_mid'] - 2 * df['bb_std']
    # 0 = at the lower band, 1 = at the upper band
    df['bb_position'] = (close - df['bb_lower']) / (df['bb_upper'] - df['bb_lower'])

    # ATR (Average True Range): true range is the largest of the bar range
    # and the two gaps from the previous close.
    prev_close = close.shift()
    high_low = df['high'] - df['low']
    high_close = (df['high'] - prev_close).abs()
    low_close = (df['low'] - prev_close).abs()
    tr = pd.concat([high_low, high_close, low_close], axis=1).max(axis=1)
    df['atr_14'] = tr.rolling(14).mean()

    return df


# Microstructure Features
Order book and trade data reveal information about supply and demand.
Order Book Imbalance
Trade Imbalance
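VWAP Deviation
$\text{VWAP deviation} = \frac{P_{close}}{VWAP} - 1$, where $VWAP = \frac{\sum_i P_i V_i}{\sum_i V_i}$.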
def calculate_microstructure_features(df, trades_df=None):
    """Add volume-based features computed from OHLCV bars to ``df``.

    Columns written: ``vwap``, ``vwap_dev``, ``volume_sma_20``,
    ``relative_volume``, ``pv_corr_20``, ``obv``, ``obv_ema`` and ``mfi``.

    Args:
        df: DataFrame with ``close``, ``high``, ``low`` and ``volume``
            columns. A ``returns`` column is used when present, otherwise
            returns are derived from ``close``. It is modified in place.
        trades_df: Currently unused; accepted for interface compatibility.

    Returns:
        The same ``df`` object with the feature columns added.
    """
    close = df['close']
    volume = df['volume']

    # Use the precomputed returns when present; otherwise derive them locally
    # so the function does not raise KeyError when called on raw OHLCV data.
    if 'returns' in df.columns:
        returns = df['returns']
    else:
        returns = close.pct_change(fill_method=None)

    # VWAP accumulated from the first row of ``df`` (close-price weighted)
    df['vwap'] = (close * volume).cumsum() / volume.cumsum()
    df['vwap_dev'] = close / df['vwap'] - 1

    # Volume features
    df['volume_sma_20'] = volume.rolling(20).mean()
    df['relative_volume'] = volume / df['volume_sma_20']

    # Price-volume correlation
    df['pv_corr_20'] = returns.rolling(20).corr(volume.pct_change(fill_method=None))

    # On-Balance Volume
    df['obv'] = (np.sign(close.diff()) * volume).cumsum()
    df['obv_ema'] = df['obv'].ewm(span=20).mean()

    # Money Flow Index
    typical_price = (df['high'] + df['low'] + close) / 3
    money_flow = typical_price * volume
    prev_typical = typical_price.shift()

    positive_flow = money_flow.where(typical_price > prev_typical, 0).rolling(14).sum()
    negative_flow = money_flow.where(typical_price < prev_typical, 0).rolling(14).sum()

    # Algebraically equal to 100 - 100 / (1 + positive / negative) but without
    # the infinite intermediate when there is no negative flow in the window.
    df['mfi'] = 100 * positive_flow / (positive_flow + negative_flow)

    return df


# Alternative Data Features
Sentiment Features
def calculate_sentiment_features(df, sentiment_df):
    """Merge sentiment data into the price data and derive sentiment features.

    Columns written: the merged ``sentiment_df`` columns plus
    ``sentiment_ma_7``, ``sentiment_change`` and ``sent_ret_corr``.

    Args:
        df: Price DataFrame with a ``date`` column and either a ``returns``
            column or a ``close`` column to derive it from.
        sentiment_df: DataFrame with ``date``, ``sentiment_score`` and
            ``sentiment_volume`` columns.

    Returns:
        A new DataFrame (the left merge of ``df`` with ``sentiment_df``) with
        the sentiment features added. The input ``df`` is not modified.
    """
    # Left merge keeps every price row, including dates with no sentiment
    df = df.merge(sentiment_df, on='date', how='left')

    # Forward fill missing sentiment. Series.ffill() replaces the deprecated
    # fillna(method='ffill'), which newer pandas versions reject.
    df['sentiment_score'] = df['sentiment_score'].ffill()

    # Sentiment momentum
    df['sentiment_ma_7'] = df['sentiment_score'].rolling(7).mean()
    df['sentiment_change'] = df['sentiment_score'].diff()

    # Sentiment-return divergence
    if 'returns' in df.columns:
        returns = df['returns']
    else:
        returns = df['close'].pct_change(fill_method=None)
    df['sent_ret_corr'] = df['sentiment_score'].rolling(20).corr(returns)

    return df


# Time Features
Markets have cyclical patterns based on time.
def add_time_features(df, trading_days_per_week=5):
    """Add calendar and time-based features to ``df``.

    Columns written: ``day_of_week``, ``month``, ``is_quarter_end``,
    ``days_to_month_end``, ``day_sin``, ``day_cos``, ``month_sin`` and
    ``month_cos``. The ``date`` column is converted to datetime.

    Args:
        df: DataFrame with a ``date`` column parseable by ``pd.to_datetime``.
            It is modified in place.
        trading_days_per_week: Period of the weekday cyclical encoding. The
            default of 5 suits Monday-Friday markets; pass 7 for data that
            also trades on weekends, otherwise Saturday/Sunday (5/6) encode
            identically to Monday/Tuesday (0/1).

    Returns:
        The same ``df`` object with the time features added.
    """
    df['date'] = pd.to_datetime(df['date'])
    dates = df['date']

    # Day of week (0=Monday)
    df['day_of_week'] = dates.dt.dayofweek

    # Month
    df['month'] = dates.dt.month

    # Quarter end effect (calendar quarter end, not last trading day)
    df['is_quarter_end'] = dates.dt.is_quarter_end.astype(int)

    # Calendar days remaining until the last day of the month
    df['days_to_month_end'] = dates.dt.days_in_month - dates.dt.day

    # Cyclical encoding (for neural networks)
    day_angle = 2 * np.pi * df['day_of_week'] / trading_days_per_week
    month_angle = 2 * np.pi * df['month'] / 12
    df['day_sin'] = np.sin(day_angle)
    df['day_cos'] = np.cos(day_angle)
    df['month_sin'] = np.sin(month_angle)
    df['month_cos'] = np.cos(month_angle)

    return df


# Feature Selection
Not all features are useful. Use these techniques to select the best:
Correlation Analysis
def analyze_feature_correlation(df, target_col, threshold=0.8):
    """Report correlation with the target and highly correlated column pairs.

    Only numeric and boolean columns are analysed; other columns (dates,
    strings) are ignored instead of making ``corr()`` fail. Results are
    printed and also returned.

    Args:
        df: DataFrame holding the features and the target.
        target_col: Name of the (numeric) target column.
        threshold: Absolute correlation above which a pair is reported.

    Returns:
        A tuple ``(target_corr, high_corr)``: ``target_corr`` is a Series of
        each column's correlation with the target, sorted descending (the
        target itself is included); ``high_corr`` is a list of
        ``(col, row, abs_corr)`` tuples, one per column pair above
        ``threshold``.
    """
    # Compute the correlation matrix once and reuse it for both reports.
    signed_corr = df.select_dtypes(include=['number', 'bool']).corr()

    # Correlation with target
    target_corr = signed_corr[target_col].sort_values(ascending=False)
    print("Top features correlated with target:")
    print(target_corr.head(20))

    # Feature correlation matrix
    corr_matrix = signed_corr.abs()
    values = corr_matrix.to_numpy()
    labels = corr_matrix.columns

    # Strict upper triangle so each pair is reported once and the diagonal
    # is skipped; NaN correlations compare False and are left out.
    above = np.triu(np.ones(values.shape, dtype=bool), k=1) & (values > threshold)

    # Walking the transposed mask yields pairs ordered by column, then row.
    high_corr = [(labels[c], labels[r], values[r, c])
                 for c, r in np.argwhere(above.T)]

    print(f"\nHighly correlated feature pairs (>{threshold}):")
    for col, row, corr in sorted(high_corr, key=lambda x: -x[2]):
        print(f" {col} - {row}: {corr:.3f}")

    return target_corr, high_corr


# Feature Importance
1from sklearn.ensemble import RandomForestClassifier
2from sklearn.inspection import permutation_importance
3
def calculate_feature_importance(X, y, X_val=None, y_val=None):
    """Calculate feature importance using a Random Forest.

    Args:
        X: Training feature DataFrame (column names are used as labels).
        y: Training target.
        X_val: Optional hold-out features for the permutation importance.
        y_val: Optional hold-out target; must be given together with
            ``X_val``.

    Returns:
        A tuple ``(importance_df, perm_importance_df)``: the forest's
        built-in importances and the permutation importances (mean and
        standard deviation over 10 repeats), each sorted descending.

    Raises:
        ValueError: If only one of ``X_val`` / ``y_val`` is provided.
    """
    if (X_val is None) != (y_val is None):
        raise ValueError("X_val and y_val must be provided together")

    # Train model
    rf = RandomForestClassifier(n_estimators=100, random_state=42)
    rf.fit(X, y)

    # Built-in importance
    importance_df = pd.DataFrame({
        'feature': X.columns,
        'importance': rf.feature_importances_,
    }).sort_values('importance', ascending=False)

    # Permutation importance (more robust). Score on the hold-out set when
    # one is supplied; without it this falls back to the training data, as
    # before.
    if X_val is None:
        X_eval, y_eval = X, y
    else:
        X_eval, y_eval = X_val, y_val

    perm_importance = permutation_importance(
        rf, X_eval, y_eval, n_repeats=10, random_state=42
    )

    perm_importance_df = pd.DataFrame({
        'feature': X.columns,
        'importance_mean': perm_importance.importances_mean,
        'importance_std': perm_importance.importances_std,
    }).sort_values('importance_mean', ascending=False)

    return importance_df, perm_importance_df


# Complete Feature Engineering Pipeline
def build_feature_matrix(df):
    """Run the complete feature engineering pipeline on OHLCV data.

    Args:
        df: DataFrame with ``date``, ``open``, ``high``, ``low``, ``close``
            and ``volume`` columns. The input is copied, not modified.

    Returns:
        A tuple ``(df, feature_cols)``: the feature DataFrame with a binary
        ``target`` column (1 when the next row's return is positive) and all
        incomplete rows removed, and the list of feature column names.
    """
    df = df.copy()

    # Price features
    df = calculate_returns(df)
    df = calculate_volatility(df)

    # Technical indicators
    df = add_technical_indicators(df)

    # Microstructure
    df = calculate_microstructure_features(df)

    # Time features
    df = add_time_features(df)

    # Target variable (next day return direction)
    next_return = df['returns'].shift(-1)
    df['target'] = (next_return > 0).astype(int)

    # The final row has no next-day return; comparing NaN > 0 would label it
    # 0 and dropna() would keep it, so remove unlabeled rows explicitly.
    df = df[next_return.notna()]

    # Drop rows with NaN (indicator warm-up periods)
    df = df.dropna()

    # Feature columns: everything except raw inputs, the target and the
    # same-day return
    excluded = {'date', 'open', 'high', 'low', 'close',
                'volume', 'target', 'returns'}
    feature_cols = [col for col in df.columns if col not in excluded]

    return df, feature_cols
32
33
# Usage: `price_data` is the caller's OHLCV DataFrame (date, open, high, low,
# close, volume); it is not defined in this file.
df, features = build_feature_matrix(price_data)
X = df[features]
y = df['target']

print(f"Feature matrix shape: {X.shape}")
print(f"Features: {features}")

# Key Takeaways
- Returns and volatility are fundamental - always include them
- Technical indicators capture classical patterns
- Microstructure features reveal supply/demand dynamics
- Time features capture seasonality
- Remove multicollinearity to improve model stability
- Use rolling calculations to avoid look-ahead bias
- Feature importance guides selection
Good features are more important than complex models. Spend time on feature engineering!