code:
# -*- coding: utf-8 -*-
'''
完整的决策树预测3D彩票涨跌的源代码,并加入决策树规则。
'''
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier, export_text
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
# 获取彩票历史数据
def get_fc3d_data():
    """Load the 3D lottery draw history and derive the sum and span columns.

    Reads the space-delimited text file ``3d_asc.txt`` from the current
    working directory, using the first field of each line as the index.
    The columns at positions 4..15 are discarded and the remaining four
    are renamed to ``日期`` (date), ``百`` (hundreds), ``十`` (tens) and
    ``个`` (units).

    Two derived columns are added:

    * ``和值`` -- the sum of the three digit columns.
    * ``跨度`` -- the sum of the absolute pairwise differences between the
      three digit columns, stored as float.

    Returns:
        pandas.DataFrame with columns
        ``['日期', '百', '十', '个', '和值', '跨度']``.

    Raises:
        ValueError: if the file does not leave exactly four columns after
            the trailing ones are dropped (raised by the column rename).
    """
    df = pd.read_csv(r'3d_asc.txt', delimiter=' ', index_col=0)

    # Drop the unused trailing columns.  A slice (rather than an explicit
    # list of positions) tolerates files that carry fewer extra columns.
    df = df.drop(columns=df.columns[4:16])
    df.columns = ['日期', '百', '十', '个']

    hundreds, tens, units = df['百'], df['十'], df['个']
    df['和值'] = hundreds + tens + units

    # Vectorized instead of a per-row iterrows()/df.loc loop: one pass over
    # the columns, and rows are addressed by position so a repeated index
    # label cannot overwrite the value of another row.
    span = (
        (hundreds - tens).abs()
        + (hundreds - units).abs()
        + (units - tens).abs()
    )
    df['跨度'] = span.astype(float)
    return df
# 特征工程
def create_features(df):
    """Build the model features and the next-draw direction label.

    The input frame must contain the ``和值`` (sum) and ``跨度`` (span)
    columns.  It is not modified; all work happens on a copy.

    Columns added to the returned frame:

    * ``pct_change``  -- relative change of ``和值`` versus the previous row.
    * ``ma5`` / ``ma10`` -- 5-row and 10-row rolling means of ``和值``.
    * ``rsi``         -- ``calculate_rsi`` applied to ``和值``.
    * ``vol_ma5``     -- 5-row rolling mean of ``跨度``.
    * ``kua_he_corr`` -- 10-row rolling correlation of ``和值`` with ``跨度``.
    * ``label``       -- 1 when the next row's ``pct_change`` is positive
      (the sum rises on the following draw), otherwise 0.

    Args:
        df: DataFrame holding at least the ``和值`` and ``跨度`` columns.

    Returns:
        A new DataFrame containing only rows where every column is non-NaN
        and for which a following row exists to define the label.
    """
    out = df.copy()  # do not write feature columns into the caller's frame
    total = out['和值']
    span = out['跨度']

    # Basic features on the digit sum.
    out['pct_change'] = total.pct_change()
    out['ma5'] = total.rolling(5).mean()
    out['ma10'] = total.rolling(10).mean()

    # Momentum indicator.
    out['rsi'] = calculate_rsi(total)

    # Span-based features.
    out['vol_ma5'] = span.rolling(5).mean()
    out['kua_he_corr'] = total.rolling(10).corr(span)

    # Direction of the following draw (1 = rises, 0 = does not rise).
    out['label'] = np.where(out['pct_change'].shift(-1) > 0, 1, 0)

    # Rows without a following value (in particular the last row) have no
    # known outcome; "NaN > 0" would otherwise silently label them 0.
    has_next = total.shift(-1).notna().to_numpy()
    return out.loc[has_next].dropna()
# 计算RSI指标
def calculate_rsi(prices, window=14):
    """Return the simple-moving-average RSI of *prices*.

    Args:
        prices: pandas Series of values.
        window: number of rows in each rolling average (default 14).

    Returns:
        pandas Series aligned with *prices* holding ``100 - 100 / (1 + RS)``,
        where RS is the rolling mean of upward moves divided by the rolling
        mean of downward moves.  The leading ``window - 1`` entries are NaN.
    """
    change = prices.diff()

    # Split the row-to-row change into its upward and downward parts; an
    # undefined change (such as the first row) counts as zero in both.
    ups = change.clip(lower=0).fillna(0)
    downs = -change.clip(upper=0).fillna(0)

    mean_up = ups.rolling(window).mean()
    mean_down = downs.rolling(window).mean()

    relative_strength = mean_up / mean_down
    return 100 - (100 / (1 + relative_strength))
# 决策树规则可视化
def show_rules(tree_model, feature_names, max_depth=3):
    """Print a text rendering of a fitted decision tree's rules.

    Args:
        tree_model: fitted decision tree estimator passed to
            ``sklearn.tree.export_text``.
        feature_names: iterable of feature names, in the column order the
            model was trained on.
        max_depth: how many levels of the tree to include in the report.
            Defaults to 3, the depth that was previously hard-coded, so
            deeper trees are shown truncated unless a larger value is given.

    Returns:
        None.  The report is written to standard output.
    """
    tree_rules = export_text(
        tree_model,
        feature_names=list(feature_names),
        max_depth=max_depth,
    )
    print("决策树核心规则:\n", tree_rules)
if __name__ == '__main__':
    # Load the draw history and derive the features and label.
    stock_data = get_fc3d_data()
    df = create_features(stock_data)

    # Model inputs and target.
    features = ['ma5', 'ma10', 'rsi', 'vol_ma5', 'kua_he_corr']
    X = df[features]
    y = df['label']

    # Chronological hold-out: the first 80% of rows train the model and the
    # last 20% evaluate it.  The features are rolling windows over
    # neighbouring rows, so a shuffled split would place near-duplicate
    # neighbours of the test rows in the training set and inflate accuracy.
    # NOTE(review): assumes the rows of 3d_asc.txt are in ascending draw
    # order, as the file name suggests -- confirm.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, shuffle=False
    )

    # Shallow tree with a minimum split size to limit over-fitting.
    dt_model = DecisionTreeClassifier(
        max_depth=5,
        min_samples_split=20,
        criterion='gini',
        random_state=42
    )
    dt_model.fit(X_train, y_train)

    # Evaluate on the held-out rows.
    y_pred = dt_model.predict(X_test)
    print("测试集准确率:", accuracy_score(y_test, y_pred))

    # Print the learned decision rules.
    show_rules(dt_model, features)

    # Horizontal bar chart of feature importances, smallest first.
    plt.figure(figsize=(10, 5))
    importances = pd.Series(dt_model.feature_importances_, index=features)
    importances.sort_values().plot(kind='barh')
    plt.title('Feature Importance')
    plt.show()
使用 pd.read_csv(r'3d_asc.txt') 读取开奖数据,计算和值的 5/10 期均线、RSI 等技术指标作为特征;决策树通过基尼系数选择最优分割点,最终输出可解释的、用于预测下一期和值涨跌的决策规则。