analyze_trading_data.py 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523
  1. import os
  2. import pandas as pd
  3. import numpy as np
  4. import matplotlib.pyplot as plt
  5. import glob
  6. import json
  7. import re
  8. from datetime import datetime
  9. def extract_timestamp(timestamp_str):
  10. """从时间戳字符串中提取日期时间"""
  11. if isinstance(timestamp_str, tuple):
  12. timestamp_str = timestamp_str[0]
  13. # 使用正则表达式匹配日期时间格式
  14. match = re.search(r'(\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}:\d{2})', timestamp_str)
  15. if match:
  16. return match.group(1)
  17. return timestamp_str
  18. def parse_state_flow(state_flow_str):
  19. """解析状态流字符串为Python对象"""
  20. if isinstance(state_flow_str, str):
  21. try:
  22. # 替换单引号为双引号以便JSON解析
  23. state_flow_str = state_flow_str.replace("'", '"')
  24. return json.loads(state_flow_str)
  25. except json.JSONDecodeError:
  26. # 如果无法解析为JSON,尝试使用eval(注意:在生产环境中应避免使用eval)
  27. try:
  28. return eval(state_flow_str)
  29. except:
  30. return state_flow_str
  31. return state_flow_str
  32. def read_xlsx_files(directory='xlsx'):
  33. """读取目录中的所有xlsx文件并合并为一个DataFrame"""
  34. # 获取目录中所有xlsx文件的路径
  35. xlsx_files = glob.glob(os.path.join(directory, '*.xlsx'))
  36. # 排除临时文件
  37. xlsx_files = [f for f in xlsx_files if not os.path.basename(f).startswith('~$')]
  38. if not xlsx_files:
  39. print(f"在 {directory} 目录中未找到xlsx文件")
  40. return None
  41. # 读取并合并所有文件
  42. dfs = []
  43. for file in xlsx_files:
  44. try:
  45. print(f"正在读取文件: {file}")
  46. df = pd.read_excel(file)
  47. dfs.append(df)
  48. except Exception as e:
  49. print(f"读取文件 {file} 时出错: {e}")
  50. if not dfs:
  51. print("没有成功读取任何文件")
  52. return None
  53. # 合并所有DataFrame
  54. combined_df = pd.concat(dfs, ignore_index=True)
  55. # 设置列名
  56. column_names = [
  57. 'pct', 'openLimit', 'closeLimit', 'cexPrice', 'dexPrice',
  58. 'symbol', 'exchangeOutAmount', 'strategy', 'queryPriceUrl',
  59. 'id', 'profit', 'creationTime', 'stateFlow', 'currentState'
  60. ]
  61. # 如果列数不匹配,进行调整
  62. if len(combined_df.columns) == len(column_names):
  63. combined_df.columns = column_names
  64. else:
  65. print(f"警告: 列数不匹配。文件列数: {len(combined_df.columns)}, 预期列数: {len(column_names)}")
  66. # 尝试使用前N列
  67. combined_df.columns = column_names[:len(combined_df.columns)]
  68. return combined_df
  69. def read_xlsx_file(file_path):
  70. """读取指定的xlsx文件并返回一个DataFrame"""
  71. if not os.path.exists(file_path):
  72. print(f"文件不存在: {file_path}")
  73. return None
  74. try:
  75. print(f"正在读取文件: {file_path}")
  76. df = pd.read_excel(file_path)
  77. except Exception as e:
  78. print(f"读取文件 {file_path} 时出错: {e}")
  79. return None
  80. # 设置列名
  81. column_names = [
  82. 'pct', 'openLimit', 'closeLimit', 'cexPrice', 'dexPrice',
  83. 'symbol', 'exchangeOutAmount', 'strategy', 'queryPriceUrl',
  84. 'id', 'profit', 'creationTime', 'stateFlow', 'currentState'
  85. ]
  86. # 如果列数不匹配,进行调整
  87. if len(df.columns) == len(column_names):
  88. df.columns = column_names
  89. else:
  90. print(f"警告: 列数不匹配。文件列数: {len(df.columns)}, 预期列数: {len(column_names)}")
  91. # 尝试使用前N列
  92. df.columns = column_names[:len(df.columns)]
  93. return df
  94. def preprocess_data(df):
  95. """数据预处理"""
  96. if df is None or df.empty:
  97. print("没有数据可供处理")
  98. return None
  99. # 创建副本避免警告
  100. processed_df = df.copy()
  101. # 转换数值列
  102. numeric_columns = ['pct', 'openLimit', 'closeLimit', 'cexPrice', 'dexPrice',
  103. 'exchangeOutAmount', 'profit']
  104. for col in numeric_columns:
  105. if col in processed_df.columns:
  106. processed_df[col] = pd.to_numeric(processed_df[col], errors='coerce')
  107. # 处理时间戳
  108. if 'creationTime' in processed_df.columns:
  109. processed_df['creationTime'] = processed_df['creationTime'].apply(extract_timestamp)
  110. processed_df['creationTime'] = pd.to_datetime(processed_df['creationTime'], errors='coerce')
  111. # 计算价差百分比
  112. if all(col in processed_df.columns for col in ['cexPrice', 'dexPrice']):
  113. processed_df['price_diff_pct'] = (processed_df['cexPrice'] - processed_df['dexPrice']) / processed_df['dexPrice'] * 100
  114. # 计算交易金额
  115. if all(col in processed_df.columns for col in ['exchangeOutAmount', 'cexPrice']):
  116. processed_df['trade_value'] = processed_df['exchangeOutAmount'] * processed_df['cexPrice']
  117. return processed_df
  118. def analyze_data(df):
  119. """基础数据分析"""
  120. if df is None or df.empty:
  121. print("没有数据可供分析")
  122. return None
  123. results = {}
  124. # 基础统计分析
  125. numeric_columns = ['pct', 'openLimit', 'closeLimit', 'cexPrice', 'dexPrice',
  126. 'exchangeOutAmount', 'profit', 'price_diff_pct', 'trade_value']
  127. # 计算基础统计量
  128. stats = df[numeric_columns].describe()
  129. results['basic_stats'] = stats
  130. # 按交易对分组统计
  131. if 'symbol' in df.columns:
  132. symbol_stats = df.groupby('symbol')['profit'].agg(['count', 'sum', 'mean', 'std']).reset_index()
  133. symbol_stats = symbol_stats.sort_values('sum', ascending=False)
  134. results['symbol_stats'] = symbol_stats
  135. # 按策略分组统计
  136. if 'strategy' in df.columns:
  137. strategy_stats = df.groupby('strategy')['profit'].agg(['count', 'sum', 'mean', 'std']).reset_index()
  138. strategy_stats = strategy_stats.sort_values('sum', ascending=False)
  139. results['strategy_stats'] = strategy_stats
  140. # 按状态分组统计
  141. if 'currentState' in df.columns:
  142. state_stats = df.groupby('currentState')['profit'].agg(['count', 'sum', 'mean']).reset_index()
  143. results['state_stats'] = state_stats
  144. # 时间序列分析
  145. if 'creationTime' in df.columns and pd.api.types.is_datetime64_any_dtype(df['creationTime']):
  146. # 按日期分组
  147. df['date'] = df['creationTime'].dt.date
  148. daily_stats = df.groupby('date')['profit'].agg(['count', 'sum', 'mean']).reset_index()
  149. results['daily_stats'] = daily_stats
  150. # 按小时分组
  151. df['hour'] = df['creationTime'].dt.hour
  152. hourly_stats = df.groupby('hour')['profit'].agg(['count', 'sum', 'mean']).reset_index()
  153. results['hourly_stats'] = hourly_stats
  154. # 相关性分析
  155. if len(numeric_columns) > 1:
  156. correlation = df[numeric_columns].corr()
  157. results['correlation'] = correlation
  158. return results
  159. def visualize_data(df, results, output_dir='analysis_results'):
  160. """数据可视化"""
  161. if df is None or df.empty:
  162. print("没有数据可供可视化")
  163. return
  164. # 创建输出目录
  165. os.makedirs(output_dir, exist_ok=True)
  166. # 1. 利润分布直方图
  167. if 'profit' in df.columns:
  168. plt.figure(figsize=(10, 6))
  169. plt.hist(df['profit'].dropna(), bins=30, alpha=0.7)
  170. plt.title('利润分布')
  171. plt.xlabel('利润')
  172. plt.ylabel('频率')
  173. plt.grid(True, alpha=0.3)
  174. plt.savefig(os.path.join(output_dir, 'profit_distribution.png'))
  175. plt.close()
  176. # 2. 价差百分比与利润的散点图
  177. if all(col in df.columns for col in ['price_diff_pct', 'profit']):
  178. plt.figure(figsize=(10, 6))
  179. plt.scatter(df['price_diff_pct'], df['profit'], alpha=0.5)
  180. plt.title('价差百分比与利润的关系')
  181. plt.xlabel('价差百分比 (%)')
  182. plt.ylabel('利润')
  183. plt.grid(True, alpha=0.3)
  184. plt.savefig(os.path.join(output_dir, 'price_diff_vs_profit.png'))
  185. plt.close()
  186. # 3. 每日利润趋势
  187. if 'daily_stats' in results:
  188. daily_stats = results['daily_stats']
  189. plt.figure(figsize=(12, 6))
  190. plt.plot(daily_stats['date'], daily_stats['sum'], marker='o', linestyle='-')
  191. plt.title('每日总利润趋势')
  192. plt.xlabel('日期')
  193. plt.ylabel('总利润')
  194. plt.grid(True, alpha=0.3)
  195. plt.xticks(rotation=45)
  196. plt.tight_layout()
  197. plt.savefig(os.path.join(output_dir, 'daily_profit_trend.png'))
  198. plt.close()
  199. # 4. 按交易对的利润总和条形图
  200. if 'symbol_stats' in results:
  201. symbol_stats = results['symbol_stats']
  202. plt.figure(figsize=(12, 8))
  203. plt.barh(symbol_stats['symbol'], symbol_stats['sum'])
  204. plt.title('各交易对总利润')
  205. plt.xlabel('总利润')
  206. plt.ylabel('交易对')
  207. plt.grid(True, alpha=0.3)
  208. plt.tight_layout()
  209. plt.savefig(os.path.join(output_dir, 'symbol_profit.png'))
  210. plt.close()
  211. # 5. 相关性热图
  212. if 'correlation' in results:
  213. correlation = results['correlation']
  214. plt.figure(figsize=(12, 10))
  215. plt.imshow(correlation, cmap='coolwarm', vmin=-1, vmax=1)
  216. plt.colorbar()
  217. plt.title('特征相关性热图')
  218. plt.xticks(range(len(correlation.columns)), correlation.columns, rotation=90)
  219. plt.yticks(range(len(correlation.columns)), correlation.columns)
  220. # 在热图上添加相关系数值
  221. for i in range(len(correlation.columns)):
  222. for j in range(len(correlation.columns)):
  223. plt.text(j, i, f'{correlation.iloc[i, j]:.2f}',
  224. ha='center', va='center',
  225. color='white' if abs(correlation.iloc[i, j]) > 0.5 else 'black')
  226. plt.tight_layout()
  227. plt.savefig(os.path.join(output_dir, 'correlation_heatmap.png'))
  228. plt.close()
  229. # 6. 按小时的平均利润
  230. if 'hourly_stats' in results:
  231. hourly_stats = results['hourly_stats']
  232. plt.figure(figsize=(12, 6))
  233. plt.bar(hourly_stats['hour'], hourly_stats['mean'])
  234. plt.title('各小时平均利润')
  235. plt.xlabel('小时')
  236. plt.ylabel('平均利润')
  237. plt.grid(True, alpha=0.3)
  238. plt.xticks(range(24))
  239. plt.tight_layout()
  240. plt.savefig(os.path.join(output_dir, 'hourly_avg_profit.png'))
  241. plt.close()
  242. def generate_report(results, output_dir='analysis_results'):
  243. """生成分析报告"""
  244. if not results:
  245. print("没有结果可供生成报告")
  246. return
  247. # 创建输出目录
  248. os.makedirs(output_dir, exist_ok=True)
  249. # 生成HTML报告
  250. html_content = """
  251. <!DOCTYPE html>
  252. <html>
  253. <head>
  254. <title>交易数据分析报告</title>
  255. <style>
  256. body { font-family: Arial, sans-serif; margin: 20px; }
  257. h1, h2 { color: #333; }
  258. table { border-collapse: collapse; width: 100%; margin-bottom: 20px; }
  259. th, td { border: 1px solid #ddd; padding: 8px; text-align: left; }
  260. th { background-color: #f2f2f2; }
  261. tr:nth-child(even) { background-color: #f9f9f9; }
  262. .image-container { margin: 20px 0; }
  263. .image-container img { max-width: 100%; height: auto; }
  264. </style>
  265. </head>
  266. <body>
  267. <h1>交易数据分析报告</h1>
  268. <p>生成时间: """ + datetime.now().strftime('%Y-%m-%d %H:%M:%S') + """</p>
  269. """
  270. # 添加基础统计信息
  271. if 'basic_stats' in results:
  272. html_content += """
  273. <h2>基础统计信息</h2>
  274. <table>
  275. <tr>
  276. <th>指标</th>
  277. """
  278. for col in results['basic_stats'].columns:
  279. html_content += f"<th>{col}</th>"
  280. html_content += "</tr>"
  281. for idx, row in results['basic_stats'].iterrows():
  282. html_content += f"<tr><td>{idx}</td>"
  283. for col in results['basic_stats'].columns:
  284. html_content += f"<td>{row[col]:.4f}</td>"
  285. html_content += "</tr>"
  286. html_content += "</table>"
  287. # 添加交易对统计
  288. if 'symbol_stats' in results:
  289. html_content += """
  290. <h2>交易对统计</h2>
  291. <table>
  292. <tr>
  293. <th>交易对</th>
  294. <th>交易次数</th>
  295. <th>总利润</th>
  296. <th>平均利润</th>
  297. <th>标准差</th>
  298. </tr>
  299. """
  300. for _, row in results['symbol_stats'].iterrows():
  301. html_content += f"""
  302. <tr>
  303. <td>{row['symbol']}</td>
  304. <td>{row['count']}</td>
  305. <td>{row['sum']:.4f}</td>
  306. <td>{row['mean']:.4f}</td>
  307. <td>{row['std']:.4f}</td>
  308. </tr>
  309. """
  310. html_content += "</table>"
  311. # 添加策略统计
  312. if 'strategy_stats' in results:
  313. html_content += """
  314. <h2>策略统计</h2>
  315. <table>
  316. <tr>
  317. <th>策略</th>
  318. <th>交易次数</th>
  319. <th>总利润</th>
  320. <th>平均利润</th>
  321. <th>标准差</th>
  322. </tr>
  323. """
  324. for _, row in results['strategy_stats'].iterrows():
  325. html_content += f"""
  326. <tr>
  327. <td>{row['strategy']}</td>
  328. <td>{row['count']}</td>
  329. <td>{row['sum']:.4f}</td>
  330. <td>{row['mean']:.4f}</td>
  331. <td>{row['std']:.4f}</td>
  332. </tr>
  333. """
  334. html_content += "</table>"
  335. # 添加状态统计
  336. if 'state_stats' in results:
  337. html_content += """
  338. <h2>状态统计</h2>
  339. <table>
  340. <tr>
  341. <th>状态</th>
  342. <th>交易次数</th>
  343. <th>总利润</th>
  344. <th>平均利润</th>
  345. </tr>
  346. """
  347. for _, row in results['state_stats'].iterrows():
  348. html_content += f"""
  349. <tr>
  350. <td>{row['currentState']}</td>
  351. <td>{row['count']}</td>
  352. <td>{row['sum']:.4f}</td>
  353. <td>{row['mean']:.4f}</td>
  354. </tr>
  355. """
  356. html_content += "</table>"
  357. # 添加每日统计
  358. if 'daily_stats' in results:
  359. html_content += """
  360. <h2>每日统计</h2>
  361. <table>
  362. <tr>
  363. <th>日期</th>
  364. <th>交易次数</th>
  365. <th>总利润</th>
  366. <th>平均利润</th>
  367. </tr>
  368. """
  369. for _, row in results['daily_stats'].iterrows():
  370. html_content += f"""
  371. <tr>
  372. <td>{row['date']}</td>
  373. <td>{row['count']}</td>
  374. <td>{row['sum']:.4f}</td>
  375. <td>{row['mean']:.4f}</td>
  376. </tr>
  377. """
  378. html_content += "</table>"
  379. # 添加图表
  380. html_content += """
  381. <h2>数据可视化</h2>
  382. <div class="image-container">
  383. <h3>利润分布</h3>
  384. <img src="profit_distribution.png" alt="利润分布">
  385. </div>
  386. <div class="image-container">
  387. <h3>价差百分比与利润的关系</h3>
  388. <img src="price_diff_vs_profit.png" alt="价差百分比与利润的关系">
  389. </div>
  390. <div class="image-container">
  391. <h3>每日总利润趋势</h3>
  392. <img src="daily_profit_trend.png" alt="每日总利润趋势">
  393. </div>
  394. <div class="image-container">
  395. <h3>各交易对总利润</h3>
  396. <img src="symbol_profit.png" alt="各交易对总利润">
  397. </div>
  398. <div class="image-container">
  399. <h3>特征相关性热图</h3>
  400. <img src="correlation_heatmap.png" alt="特征相关性热图">
  401. </div>
  402. <div class="image-container">
  403. <h3>各小时平均利润</h3>
  404. <img src="hourly_avg_profit.png" alt="各小时平均利润">
  405. </div>
  406. """
  407. # 结束HTML
  408. html_content += """
  409. </body>
  410. </html>
  411. """
  412. # 写入HTML文件
  413. with open(os.path.join(output_dir, 'analysis_report.html'), 'w', encoding='utf-8') as f:
  414. f.write(html_content)
  415. print(f"分析报告已生成: {os.path.join(output_dir, 'analysis_report.html')}")
  416. def main():
  417. """主函数"""
  418. import argparse
  419. parser = argparse.ArgumentParser(description='交易数据分析脚本')
  420. parser.add_argument('file_path', type=str, help='要分析的XLSX文件的路径')
  421. args = parser.parse_args()
  422. print(f"开始读取XLSX文件: {args.file_path}")
  423. df = read_xlsx_file(args.file_path)
  424. if df is not None and not df.empty:
  425. print(f"成功读取数据,共 {len(df)} 行")
  426. print("正在预处理数据...")
  427. processed_df = preprocess_data(df)
  428. print("正在分析数据...")
  429. results = analyze_data(processed_df)
  430. print("正在生成可视化图表...")
  431. visualize_data(processed_df, results)
  432. print("正在生成分析报告...")
  433. generate_report(results)
  434. print("分析完成!")
  435. else:
  436. print("没有数据可供分析")
  437. if __name__ == "__main__":
  438. main()