analyze_trading_data.py 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470
  1. import os
  2. import pandas as pd
  3. import numpy as np
  4. import matplotlib.pyplot as plt
  5. import glob
  6. import json
  7. import re
  8. from datetime import datetime
  9. # 配置matplotlib中文字体显示
  10. plt.rcParams['font.sans-serif'] = ['SimHei', 'Microsoft YaHei', 'DejaVu Sans'] # 用来正常显示中文标签
  11. plt.rcParams['axes.unicode_minus'] = False # 用来正常显示负号
  12. def extract_timestamp(timestamp_str):
  13. """从时间戳字符串中提取日期时间"""
  14. if isinstance(timestamp_str, tuple):
  15. timestamp_str = timestamp_str[0]
  16. # 使用正则表达式匹配日期时间格式
  17. match = re.search(r'(\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}:\d{2})', timestamp_str)
  18. if match:
  19. return match.group(1)
  20. return timestamp_str
  21. def parse_state_flow(state_flow_str):
  22. """解析状态流字符串为Python对象"""
  23. if isinstance(state_flow_str, str):
  24. try:
  25. # 替换单引号为双引号以便JSON解析
  26. state_flow_str = state_flow_str.replace("'", '"')
  27. return json.loads(state_flow_str)
  28. except json.JSONDecodeError:
  29. # 如果无法解析为JSON,尝试使用eval(注意:在生产环境中应避免使用eval)
  30. try:
  31. return eval(state_flow_str)
  32. except:
  33. return state_flow_str
  34. return state_flow_str
  35. def read_xlsx_files(directory='xlsx'):
  36. """读取目录中的所有xlsx文件并合并为一个DataFrame"""
  37. # 获取目录中所有xlsx文件的路径
  38. xlsx_files = glob.glob(os.path.join(directory, '*.xlsx'))
  39. # 排除临时文件
  40. xlsx_files = [f for f in xlsx_files if not os.path.basename(f).startswith('~$')]
  41. if not xlsx_files:
  42. print(f"在 {directory} 目录中未找到xlsx文件")
  43. return None
  44. # 读取并合并所有文件
  45. dfs = []
  46. for file in xlsx_files:
  47. try:
  48. print(f"正在读取文件: {file}")
  49. df = pd.read_excel(file)
  50. dfs.append(df)
  51. except Exception as e:
  52. print(f"读取文件 {file} 时出错: {e}")
  53. if not dfs:
  54. print("没有成功读取任何文件")
  55. return None
  56. # 合并所有DataFrame
  57. combined_df = pd.concat(dfs, ignore_index=True)
  58. # 设置列名
  59. column_names = [
  60. 'pct', 'openLimit', 'closeLimit', 'cexPrice', 'dexPrice',
  61. 'symbol', 'exchangeOutAmount', 'strategy', 'queryPriceUrl',
  62. 'id', 'profit', 'creationTime', 'stateFlow', 'currentState'
  63. ]
  64. # 如果列数不匹配,进行调整
  65. if len(combined_df.columns) == len(column_names):
  66. combined_df.columns = column_names
  67. else:
  68. print(f"警告: 列数不匹配。文件列数: {len(combined_df.columns)}, 预期列数: {len(column_names)}")
  69. # 尝试使用前N列
  70. combined_df.columns = column_names[:len(combined_df.columns)]
  71. return combined_df
  72. def read_xlsx_file(file_path):
  73. """读取指定的xlsx文件并返回一个DataFrame"""
  74. if not os.path.exists(file_path):
  75. print(f"文件不存在: {file_path}")
  76. return None
  77. try:
  78. print(f"正在读取文件: {file_path}")
  79. df = pd.read_excel(file_path)
  80. except Exception as e:
  81. print(f"读取文件 {file_path} 时出错: {e}")
  82. return None
  83. # 设置列名
  84. column_names = [
  85. 'pct', 'openLimit', 'closeLimit', 'cexPrice', 'dexPrice',
  86. 'symbol', 'exchangeOutAmount', 'strategy', 'queryPriceUrl',
  87. 'id', 'profit', 'creationTime', 'stateFlow', 'currentState'
  88. ]
  89. # 如果列数不匹配,进行调整
  90. if len(df.columns) == len(column_names):
  91. df.columns = column_names
  92. else:
  93. print(f"警告: 列数不匹配。文件列数: {len(df.columns)}, 预期列数: {len(column_names)}")
  94. # 尝试使用前N列
  95. df.columns = column_names[:len(df.columns)]
  96. return df
  97. def preprocess_data(df):
  98. """数据预处理"""
  99. if df is None or df.empty:
  100. print("没有数据可供处理")
  101. return None
  102. # 创建副本避免警告
  103. processed_df = df.copy()
  104. # 转换数值列
  105. numeric_columns = ['pct', 'openLimit', 'closeLimit', 'cexPrice', 'dexPrice',
  106. 'exchangeOutAmount', 'profit']
  107. for col in numeric_columns:
  108. if col in processed_df.columns:
  109. processed_df[col] = pd.to_numeric(processed_df[col], errors='coerce')
  110. # 处理时间戳
  111. if 'creationTime' in processed_df.columns:
  112. processed_df['creationTime'] = processed_df['creationTime'].apply(extract_timestamp)
  113. processed_df['creationTime'] = pd.to_datetime(processed_df['creationTime'], errors='coerce')
  114. # 计算价差百分比
  115. if all(col in processed_df.columns for col in ['cexPrice', 'dexPrice']):
  116. processed_df['price_diff_pct'] = (processed_df['cexPrice'] - processed_df['dexPrice']) / processed_df['dexPrice'] * 100
  117. # 计算交易金额
  118. if all(col in processed_df.columns for col in ['exchangeOutAmount', 'cexPrice']):
  119. processed_df['trade_value'] = processed_df['exchangeOutAmount'] * processed_df['cexPrice']
  120. return processed_df
  121. def analyze_data(df):
  122. """基础数据分析"""
  123. if df is None or df.empty:
  124. print("没有数据可供分析")
  125. return None
  126. results = {}
  127. # 数值列定义(用于其他分析)
  128. numeric_columns = ['pct', 'openLimit', 'closeLimit', 'cexPrice', 'dexPrice',
  129. 'exchangeOutAmount', 'profit', 'price_diff_pct', 'trade_value']
  130. # 按交易对分组统计
  131. if 'symbol' in df.columns:
  132. symbol_stats = df.groupby('symbol')['profit'].agg(['count', 'sum', 'mean', 'std']).reset_index()
  133. symbol_stats = symbol_stats.sort_values('sum', ascending=False)
  134. results['symbol_stats'] = symbol_stats
  135. # 按策略分组统计
  136. if 'strategy' in df.columns:
  137. strategy_stats = df.groupby('strategy')['profit'].agg(['count', 'sum', 'mean', 'std']).reset_index()
  138. strategy_stats = strategy_stats.sort_values('sum', ascending=False)
  139. results['strategy_stats'] = strategy_stats
  140. # 按状态分组统计
  141. if 'currentState' in df.columns:
  142. state_stats = df.groupby('currentState')['profit'].agg(['count', 'sum', 'mean']).reset_index()
  143. results['state_stats'] = state_stats
  144. # 时间序列分析
  145. if 'creationTime' in df.columns and pd.api.types.is_datetime64_any_dtype(df['creationTime']):
  146. # 按日期分组
  147. df['date'] = df['creationTime'].dt.date
  148. daily_stats = df.groupby('date')['profit'].agg(['count', 'sum', 'mean']).reset_index()
  149. results['daily_stats'] = daily_stats
  150. # 按小时分组
  151. df['hour'] = df['creationTime'].dt.hour
  152. hourly_stats = df.groupby('hour')['profit'].agg(['count', 'sum', 'mean']).reset_index()
  153. results['hourly_stats'] = hourly_stats
  154. return results
  155. def visualize_data(df, results, output_dir='analysis_results'):
  156. """数据可视化"""
  157. if df is None or df.empty:
  158. print("没有数据可供可视化")
  159. return
  160. # 创建输出目录
  161. os.makedirs(output_dir, exist_ok=True)
  162. # 1. 利润分布直方图
  163. if 'profit' in df.columns:
  164. plt.figure(figsize=(10, 6))
  165. plt.hist(df['profit'].dropna(), bins=30, alpha=0.7)
  166. plt.title('利润分布')
  167. plt.xlabel('利润')
  168. plt.ylabel('频率')
  169. plt.grid(True, alpha=0.3)
  170. plt.savefig(os.path.join(output_dir, 'profit_distribution.png'))
  171. plt.close()
  172. # 2. 价差百分比与利润的散点图
  173. if all(col in df.columns for col in ['price_diff_pct', 'profit']):
  174. plt.figure(figsize=(10, 6))
  175. plt.scatter(df['price_diff_pct'], df['profit'], alpha=0.5)
  176. plt.title('价差百分比与利润的关系')
  177. plt.xlabel('价差百分比 (%)')
  178. plt.ylabel('利润')
  179. plt.grid(True, alpha=0.3)
  180. plt.savefig(os.path.join(output_dir, 'price_diff_vs_profit.png'))
  181. plt.close()
  182. # 3. 每日利润趋势
  183. if 'daily_stats' in results:
  184. daily_stats = results['daily_stats']
  185. plt.figure(figsize=(12, 6))
  186. plt.plot(daily_stats['date'], daily_stats['sum'], marker='o', linestyle='-')
  187. plt.title('每日总利润趋势')
  188. plt.xlabel('日期')
  189. plt.ylabel('总利润')
  190. plt.grid(True, alpha=0.3)
  191. plt.xticks(rotation=45)
  192. plt.tight_layout()
  193. plt.savefig(os.path.join(output_dir, 'daily_profit_trend.png'))
  194. plt.close()
  195. # 4. 按交易对的利润总和条形图
  196. if 'symbol_stats' in results:
  197. symbol_stats = results['symbol_stats']
  198. plt.figure(figsize=(12, 8))
  199. plt.barh(symbol_stats['symbol'], symbol_stats['sum'])
  200. plt.title('各交易对总利润')
  201. plt.xlabel('总利润')
  202. plt.ylabel('交易对')
  203. plt.grid(True, alpha=0.3)
  204. plt.tight_layout()
  205. plt.savefig(os.path.join(output_dir, 'symbol_profit.png'))
  206. plt.close()
  207. # 6. 按小时的平均利润
  208. if 'hourly_stats' in results:
  209. hourly_stats = results['hourly_stats']
  210. plt.figure(figsize=(12, 6))
  211. plt.bar(hourly_stats['hour'], hourly_stats['mean'])
  212. plt.title('各小时平均利润')
  213. plt.xlabel('小时')
  214. plt.ylabel('平均利润')
  215. plt.grid(True, alpha=0.3)
  216. plt.xticks(range(24))
  217. plt.tight_layout()
  218. plt.savefig(os.path.join(output_dir, 'hourly_avg_profit.png'))
  219. plt.close()
  220. def generate_report(results, output_dir='analysis_results'):
  221. """生成分析报告"""
  222. if not results:
  223. print("没有结果可供生成报告")
  224. return
  225. # 创建输出目录
  226. os.makedirs(output_dir, exist_ok=True)
  227. # 生成HTML报告
  228. html_content = """
  229. <!DOCTYPE html>
  230. <html>
  231. <head>
  232. <title>交易数据分析报告</title>
  233. <style>
  234. body { font-family: Arial, sans-serif; margin: 20px; }
  235. h1, h2 { color: #333; }
  236. table { border-collapse: collapse; width: 100%; margin-bottom: 20px; }
  237. th, td { border: 1px solid #ddd; padding: 8px; text-align: left; }
  238. th { background-color: #f2f2f2; }
  239. tr:nth-child(even) { background-color: #f9f9f9; }
  240. .image-container { margin: 20px 0; }
  241. .image-container img { max-width: 100%; height: auto; }
  242. </style>
  243. </head>
  244. <body>
  245. <h1>交易数据分析报告</h1>
  246. <p>生成时间: """ + datetime.now().strftime('%Y-%m-%d %H:%M:%S') + """</p>
  247. """
  248. # 添加交易对统计
  249. if 'symbol_stats' in results:
  250. html_content += """
  251. <h2>交易对统计</h2>
  252. <table>
  253. <tr>
  254. <th>交易对</th>
  255. <th>交易次数</th>
  256. <th>总利润</th>
  257. <th>平均利润</th>
  258. <th>标准差</th>
  259. </tr>
  260. """
  261. for _, row in results['symbol_stats'].iterrows():
  262. html_content += f"""
  263. <tr>
  264. <td>{row['symbol']}</td>
  265. <td>{row['count']}</td>
  266. <td>{row['sum']:.4f}</td>
  267. <td>{row['mean']:.4f}</td>
  268. <td>{row['std']:.4f}</td>
  269. </tr>
  270. """
  271. html_content += "</table>"
  272. # 添加策略统计
  273. if 'strategy_stats' in results:
  274. html_content += """
  275. <h2>策略统计</h2>
  276. <table>
  277. <tr>
  278. <th>策略</th>
  279. <th>交易次数</th>
  280. <th>总利润</th>
  281. <th>平均利润</th>
  282. <th>标准差</th>
  283. </tr>
  284. """
  285. for _, row in results['strategy_stats'].iterrows():
  286. html_content += f"""
  287. <tr>
  288. <td>{row['strategy']}</td>
  289. <td>{row['count']}</td>
  290. <td>{row['sum']:.4f}</td>
  291. <td>{row['mean']:.4f}</td>
  292. <td>{row['std']:.4f}</td>
  293. </tr>
  294. """
  295. html_content += "</table>"
  296. # 添加状态统计
  297. if 'state_stats' in results:
  298. html_content += """
  299. <h2>状态统计</h2>
  300. <table>
  301. <tr>
  302. <th>状态</th>
  303. <th>交易次数</th>
  304. <th>总利润</th>
  305. <th>平均利润</th>
  306. </tr>
  307. """
  308. for _, row in results['state_stats'].iterrows():
  309. html_content += f"""
  310. <tr>
  311. <td>{row['currentState']}</td>
  312. <td>{row['count']}</td>
  313. <td>{row['sum']:.4f}</td>
  314. <td>{row['mean']:.4f}</td>
  315. </tr>
  316. """
  317. html_content += "</table>"
  318. # 添加每日统计
  319. if 'daily_stats' in results:
  320. html_content += """
  321. <h2>每日统计</h2>
  322. <table>
  323. <tr>
  324. <th>日期</th>
  325. <th>交易次数</th>
  326. <th>总利润</th>
  327. <th>平均利润</th>
  328. </tr>
  329. """
  330. for _, row in results['daily_stats'].iterrows():
  331. html_content += f"""
  332. <tr>
  333. <td>{row['date']}</td>
  334. <td>{row['count']}</td>
  335. <td>{row['sum']:.4f}</td>
  336. <td>{row['mean']:.4f}</td>
  337. </tr>
  338. """
  339. html_content += "</table>"
  340. # 添加图表
  341. html_content += """
  342. <h2>数据可视化</h2>
  343. <div class="image-container">
  344. <h3>利润分布</h3>
  345. <img src="profit_distribution.png" alt="利润分布">
  346. </div>
  347. <div class="image-container">
  348. <h3>价差百分比与利润的关系</h3>
  349. <img src="price_diff_vs_profit.png" alt="价差百分比与利润的关系">
  350. </div>
  351. <div class="image-container">
  352. <h3>每日总利润趋势</h3>
  353. <img src="daily_profit_trend.png" alt="每日总利润趋势">
  354. </div>
  355. <div class="image-container">
  356. <h3>各交易对总利润</h3>
  357. <img src="symbol_profit.png" alt="各交易对总利润">
  358. </div>
  359. <div class="image-container">
  360. <h3>各小时平均利润</h3>
  361. <img src="hourly_avg_profit.png" alt="各小时平均利润">
  362. </div>
  363. """
  364. # 结束HTML
  365. html_content += """
  366. </body>
  367. </html>
  368. """
  369. # 写入HTML文件
  370. with open(os.path.join(output_dir, 'analysis_report.html'), 'w', encoding='utf-8') as f:
  371. f.write(html_content)
  372. print(f"分析报告已生成: {os.path.join(output_dir, 'analysis_report.html')}")
  373. def main():
  374. """主函数"""
  375. import argparse
  376. parser = argparse.ArgumentParser(description='交易数据分析脚本')
  377. parser.add_argument('file_path', type=str, help='要分析的XLSX文件的路径')
  378. args = parser.parse_args()
  379. print(f"开始读取XLSX文件: {args.file_path}")
  380. df = read_xlsx_file(args.file_path)
  381. if df is not None and not df.empty:
  382. print(f"成功读取数据,共 {len(df)} 行")
  383. print("正在预处理数据...")
  384. processed_df = preprocess_data(df)
  385. print("正在分析数据...")
  386. results = analyze_data(processed_df)
  387. print("正在生成可视化图表...")
  388. visualize_data(processed_df, results)
  389. print("正在生成分析报告...")
  390. generate_report(results)
  391. print("分析完成!")
  392. else:
  393. print("没有数据可供分析")
  394. if __name__ == "__main__":
  395. main()