Quellcode durchsuchen

feat: 添加交易数据分析脚本和更新.gitignore

添加analyze_trading_data.py脚本用于读取、处理和分析交易数据
更新.gitignore以排除analysis_results目录
skyfffire vor 1 Woche
Ursprung
Commit
ca25e7a1ca
2 geänderte Dateien mit 526 neuen und 1 gelöschten Zeilen
  1. 3 1
      .gitignore
  2. 523 0
      analyze_trading_data.py

+ 3 - 1
.gitignore

@@ -42,4 +42,6 @@ htmlcov/
 
 # Local configuration
 .env
-.env.local
+.env.local
+
+analysis_results/

+ 523 - 0
analyze_trading_data.py

@@ -0,0 +1,523 @@
+import os
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+import glob
+import json
+import re
+from datetime import datetime
+
+def extract_timestamp(timestamp_str):
+    """从时间戳字符串中提取日期时间"""
+    if isinstance(timestamp_str, tuple):
+        timestamp_str = timestamp_str[0]
+    
+    # 使用正则表达式匹配日期时间格式
+    match = re.search(r'(\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}:\d{2})', timestamp_str)
+    if match:
+        return match.group(1)
+    return timestamp_str
+
+def parse_state_flow(state_flow_str):
+    """解析状态流字符串为Python对象"""
+    if isinstance(state_flow_str, str):
+        try:
+            # 替换单引号为双引号以便JSON解析
+            state_flow_str = state_flow_str.replace("'", '"')
+            return json.loads(state_flow_str)
+        except json.JSONDecodeError:
+            # 如果无法解析为JSON,尝试使用eval(注意:在生产环境中应避免使用eval)
+            try:
+                return eval(state_flow_str)
+            except:
+                return state_flow_str
+    return state_flow_str
+
+def read_xlsx_files(directory='xlsx'):
+    """读取目录中的所有xlsx文件并合并为一个DataFrame"""
+    # 获取目录中所有xlsx文件的路径
+    xlsx_files = glob.glob(os.path.join(directory, '*.xlsx'))
+    
+    # 排除临时文件
+    xlsx_files = [f for f in xlsx_files if not os.path.basename(f).startswith('~$')]
+    
+    if not xlsx_files:
+        print(f"在 {directory} 目录中未找到xlsx文件")
+        return None
+    
+    # 读取并合并所有文件
+    dfs = []
+    for file in xlsx_files:
+        try:
+            print(f"正在读取文件: {file}")
+            df = pd.read_excel(file)
+            dfs.append(df)
+        except Exception as e:
+            print(f"读取文件 {file} 时出错: {e}")
+    
+    if not dfs:
+        print("没有成功读取任何文件")
+        return None
+    
+    # 合并所有DataFrame
+    combined_df = pd.concat(dfs, ignore_index=True)
+    
+    # 设置列名
+    column_names = [
+        'pct', 'openLimit', 'closeLimit', 'cexPrice', 'dexPrice', 
+        'symbol', 'exchangeOutAmount', 'strategy', 'queryPriceUrl', 
+        'id', 'profit', 'creationTime', 'stateFlow', 'currentState'
+    ]
+    
+    # 如果列数不匹配,进行调整
+    if len(combined_df.columns) == len(column_names):
+        combined_df.columns = column_names
+    else:
+        print(f"警告: 列数不匹配。文件列数: {len(combined_df.columns)}, 预期列数: {len(column_names)}")
+        # 尝试使用前N列
+        combined_df.columns = column_names[:len(combined_df.columns)]
+    
+    return combined_df
+
+def read_xlsx_file(file_path):
+    """读取指定的xlsx文件并返回一个DataFrame"""
+    if not os.path.exists(file_path):
+        print(f"文件不存在: {file_path}")
+        return None
+    
+    try:
+        print(f"正在读取文件: {file_path}")
+        df = pd.read_excel(file_path)
+    except Exception as e:
+        print(f"读取文件 {file_path} 时出错: {e}")
+        return None
+    
+    # 设置列名
+    column_names = [
+        'pct', 'openLimit', 'closeLimit', 'cexPrice', 'dexPrice', 
+        'symbol', 'exchangeOutAmount', 'strategy', 'queryPriceUrl', 
+        'id', 'profit', 'creationTime', 'stateFlow', 'currentState'
+    ]
+    
+    # 如果列数不匹配,进行调整
+    if len(df.columns) == len(column_names):
+        df.columns = column_names
+    else:
+        print(f"警告: 列数不匹配。文件列数: {len(df.columns)}, 预期列数: {len(column_names)}")
+        # 尝试使用前N列
+        df.columns = column_names[:len(df.columns)]
+    
+    return df
+
+def preprocess_data(df):
+    """数据预处理"""
+    if df is None or df.empty:
+        print("没有数据可供处理")
+        return None
+    
+    # 创建副本避免警告
+    processed_df = df.copy()
+    
+    # 转换数值列
+    numeric_columns = ['pct', 'openLimit', 'closeLimit', 'cexPrice', 'dexPrice', 
+                       'exchangeOutAmount', 'profit']
+    
+    for col in numeric_columns:
+        if col in processed_df.columns:
+            processed_df[col] = pd.to_numeric(processed_df[col], errors='coerce')
+    
+    # 处理时间戳
+    if 'creationTime' in processed_df.columns:
+        processed_df['creationTime'] = processed_df['creationTime'].apply(extract_timestamp)
+        processed_df['creationTime'] = pd.to_datetime(processed_df['creationTime'], errors='coerce')
+    
+    # 计算价差百分比
+    if all(col in processed_df.columns for col in ['cexPrice', 'dexPrice']):
+        processed_df['price_diff_pct'] = (processed_df['cexPrice'] - processed_df['dexPrice']) / processed_df['dexPrice'] * 100
+    
+    # 计算交易金额
+    if all(col in processed_df.columns for col in ['exchangeOutAmount', 'cexPrice']):
+        processed_df['trade_value'] = processed_df['exchangeOutAmount'] * processed_df['cexPrice']
+    
+    return processed_df
+
+def analyze_data(df):
+    """基础数据分析"""
+    if df is None or df.empty:
+        print("没有数据可供分析")
+        return None
+    
+    results = {}
+    
+    # 基础统计分析
+    numeric_columns = ['pct', 'openLimit', 'closeLimit', 'cexPrice', 'dexPrice', 
+                       'exchangeOutAmount', 'profit', 'price_diff_pct', 'trade_value']
+    
+    # 计算基础统计量
+    stats = df[numeric_columns].describe()
+    results['basic_stats'] = stats
+    
+    # 按交易对分组统计
+    if 'symbol' in df.columns:
+        symbol_stats = df.groupby('symbol')['profit'].agg(['count', 'sum', 'mean', 'std']).reset_index()
+        symbol_stats = symbol_stats.sort_values('sum', ascending=False)
+        results['symbol_stats'] = symbol_stats
+    
+    # 按策略分组统计
+    if 'strategy' in df.columns:
+        strategy_stats = df.groupby('strategy')['profit'].agg(['count', 'sum', 'mean', 'std']).reset_index()
+        strategy_stats = strategy_stats.sort_values('sum', ascending=False)
+        results['strategy_stats'] = strategy_stats
+    
+    # 按状态分组统计
+    if 'currentState' in df.columns:
+        state_stats = df.groupby('currentState')['profit'].agg(['count', 'sum', 'mean']).reset_index()
+        results['state_stats'] = state_stats
+    
+    # 时间序列分析
+    if 'creationTime' in df.columns and pd.api.types.is_datetime64_any_dtype(df['creationTime']):
+        # 按日期分组
+        df['date'] = df['creationTime'].dt.date
+        daily_stats = df.groupby('date')['profit'].agg(['count', 'sum', 'mean']).reset_index()
+        results['daily_stats'] = daily_stats
+        
+        # 按小时分组
+        df['hour'] = df['creationTime'].dt.hour
+        hourly_stats = df.groupby('hour')['profit'].agg(['count', 'sum', 'mean']).reset_index()
+        results['hourly_stats'] = hourly_stats
+    
+    # 相关性分析
+    if len(numeric_columns) > 1:
+        correlation = df[numeric_columns].corr()
+        results['correlation'] = correlation
+    
+    return results
+
+def visualize_data(df, results, output_dir='analysis_results'):
+    """Render the analysis charts as PNG files inside ``output_dir``.
+
+    Each chart is drawn on its own new figure, saved, and the figure is then
+    closed. A chart is produced only when the columns of ``df`` or the keys
+    of ``results`` it needs are present:
+
+    * ``profit_distribution.png``  - histogram of ``df['profit']``
+    * ``price_diff_vs_profit.png`` - scatter of ``price_diff_pct`` vs ``profit``
+    * ``daily_profit_trend.png``   - line plot of ``results['daily_stats']``
+    * ``symbol_profit.png``        - horizontal bars of ``results['symbol_stats']``
+    * ``correlation_heatmap.png``  - annotated ``results['correlation']`` matrix
+    * ``hourly_avg_profit.png``    - bars of ``results['hourly_stats']`` means
+
+    Args:
+        df: DataFrame holding the plotted columns; nothing is drawn when it
+            is ``None`` or empty.
+        results: Mapping of result names to DataFrames, tested with ``in``
+            for each optional chart.
+        output_dir: Directory the PNG files are written to; created if
+            missing.
+    """
+    if df is None or df.empty:
+        print("没有数据可供可视化")
+        return
+    
+    # Make sure the output directory exists
+    os.makedirs(output_dir, exist_ok=True)
+    
+    # 1. Histogram of the profit distribution (missing values dropped)
+    if 'profit' in df.columns:
+        plt.figure(figsize=(10, 6))
+        plt.hist(df['profit'].dropna(), bins=30, alpha=0.7)
+        plt.title('利润分布')
+        plt.xlabel('利润')
+        plt.ylabel('频率')
+        plt.grid(True, alpha=0.3)
+        plt.savefig(os.path.join(output_dir, 'profit_distribution.png'))
+        plt.close()
+    
+    # 2. Scatter plot of price-difference percentage against profit
+    if all(col in df.columns for col in ['price_diff_pct', 'profit']):
+        plt.figure(figsize=(10, 6))
+        plt.scatter(df['price_diff_pct'], df['profit'], alpha=0.5)
+        plt.title('价差百分比与利润的关系')
+        plt.xlabel('价差百分比 (%)')
+        plt.ylabel('利润')
+        plt.grid(True, alpha=0.3)
+        plt.savefig(os.path.join(output_dir, 'price_diff_vs_profit.png'))
+        plt.close()
+    
+    # 3. Daily total-profit trend
+    if 'daily_stats' in results:
+        daily_stats = results['daily_stats']
+        plt.figure(figsize=(12, 6))
+        plt.plot(daily_stats['date'], daily_stats['sum'], marker='o', linestyle='-')
+        plt.title('每日总利润趋势')
+        plt.xlabel('日期')
+        plt.ylabel('总利润')
+        plt.grid(True, alpha=0.3)
+        plt.xticks(rotation=45)
+        plt.tight_layout()
+        plt.savefig(os.path.join(output_dir, 'daily_profit_trend.png'))
+        plt.close()
+    
+    # 4. Horizontal bar chart of total profit per trading pair
+    if 'symbol_stats' in results:
+        symbol_stats = results['symbol_stats']
+        plt.figure(figsize=(12, 8))
+        plt.barh(symbol_stats['symbol'], symbol_stats['sum'])
+        plt.title('各交易对总利润')
+        plt.xlabel('总利润')
+        plt.ylabel('交易对')
+        plt.grid(True, alpha=0.3)
+        plt.tight_layout()
+        plt.savefig(os.path.join(output_dir, 'symbol_profit.png'))
+        plt.close()
+    
+    # 5. Correlation heatmap (colour scale fixed to [-1, 1])
+    if 'correlation' in results:
+        correlation = results['correlation']
+        plt.figure(figsize=(12, 10))
+        plt.imshow(correlation, cmap='coolwarm', vmin=-1, vmax=1)
+        plt.colorbar()
+        plt.title('特征相关性热图')
+        plt.xticks(range(len(correlation.columns)), correlation.columns, rotation=90)
+        plt.yticks(range(len(correlation.columns)), correlation.columns)
+        
+        # Write each coefficient into its cell; white text where the
+        # absolute value exceeds 0.5, black text otherwise
+        for i in range(len(correlation.columns)):
+            for j in range(len(correlation.columns)):
+                plt.text(j, i, f'{correlation.iloc[i, j]:.2f}', 
+                         ha='center', va='center', 
+                         color='white' if abs(correlation.iloc[i, j]) > 0.5 else 'black')
+        
+        plt.tight_layout()
+        plt.savefig(os.path.join(output_dir, 'correlation_heatmap.png'))
+        plt.close()
+    
+    # 6. Mean profit per hour of day (x axis ticks fixed at 0..23)
+    if 'hourly_stats' in results:
+        hourly_stats = results['hourly_stats']
+        plt.figure(figsize=(12, 6))
+        plt.bar(hourly_stats['hour'], hourly_stats['mean'])
+        plt.title('各小时平均利润')
+        plt.xlabel('小时')
+        plt.ylabel('平均利润')
+        plt.grid(True, alpha=0.3)
+        plt.xticks(range(24))
+        plt.tight_layout()
+        plt.savefig(os.path.join(output_dir, 'hourly_avg_profit.png'))
+        plt.close()
+
+def generate_report(results, output_dir='analysis_results'):
+    """生成分析报告"""
+    if not results:
+        print("没有结果可供生成报告")
+        return
+    
+    # 创建输出目录
+    os.makedirs(output_dir, exist_ok=True)
+    
+    # 生成HTML报告
+    html_content = """
+    <!DOCTYPE html>
+    <html>
+    <head>
+        <title>交易数据分析报告</title>
+        <style>
+            body { font-family: Arial, sans-serif; margin: 20px; }
+            h1, h2 { color: #333; }
+            table { border-collapse: collapse; width: 100%; margin-bottom: 20px; }
+            th, td { border: 1px solid #ddd; padding: 8px; text-align: left; }
+            th { background-color: #f2f2f2; }
+            tr:nth-child(even) { background-color: #f9f9f9; }
+            .image-container { margin: 20px 0; }
+            .image-container img { max-width: 100%; height: auto; }
+        </style>
+    </head>
+    <body>
+        <h1>交易数据分析报告</h1>
+        <p>生成时间: """ + datetime.now().strftime('%Y-%m-%d %H:%M:%S') + """</p>
+    """
+    
+    # 添加基础统计信息
+    if 'basic_stats' in results:
+        html_content += """
+        <h2>基础统计信息</h2>
+        <table>
+            <tr>
+                <th>指标</th>
+        """
+        
+        for col in results['basic_stats'].columns:
+            html_content += f"<th>{col}</th>"
+        
+        html_content += "</tr>"
+        
+        for idx, row in results['basic_stats'].iterrows():
+            html_content += f"<tr><td>{idx}</td>"
+            for col in results['basic_stats'].columns:
+                html_content += f"<td>{row[col]:.4f}</td>"
+            html_content += "</tr>"
+        
+        html_content += "</table>"
+    
+    # 添加交易对统计
+    if 'symbol_stats' in results:
+        html_content += """
+        <h2>交易对统计</h2>
+        <table>
+            <tr>
+                <th>交易对</th>
+                <th>交易次数</th>
+                <th>总利润</th>
+                <th>平均利润</th>
+                <th>标准差</th>
+            </tr>
+        """
+        
+        for _, row in results['symbol_stats'].iterrows():
+            html_content += f"""
+            <tr>
+                <td>{row['symbol']}</td>
+                <td>{row['count']}</td>
+                <td>{row['sum']:.4f}</td>
+                <td>{row['mean']:.4f}</td>
+                <td>{row['std']:.4f}</td>
+            </tr>
+            """
+        
+        html_content += "</table>"
+    
+    # 添加策略统计
+    if 'strategy_stats' in results:
+        html_content += """
+        <h2>策略统计</h2>
+        <table>
+            <tr>
+                <th>策略</th>
+                <th>交易次数</th>
+                <th>总利润</th>
+                <th>平均利润</th>
+                <th>标准差</th>
+            </tr>
+        """
+        
+        for _, row in results['strategy_stats'].iterrows():
+            html_content += f"""
+            <tr>
+                <td>{row['strategy']}</td>
+                <td>{row['count']}</td>
+                <td>{row['sum']:.4f}</td>
+                <td>{row['mean']:.4f}</td>
+                <td>{row['std']:.4f}</td>
+            </tr>
+            """
+        
+        html_content += "</table>"
+    
+    # 添加状态统计
+    if 'state_stats' in results:
+        html_content += """
+        <h2>状态统计</h2>
+        <table>
+            <tr>
+                <th>状态</th>
+                <th>交易次数</th>
+                <th>总利润</th>
+                <th>平均利润</th>
+            </tr>
+        """
+        
+        for _, row in results['state_stats'].iterrows():
+            html_content += f"""
+            <tr>
+                <td>{row['currentState']}</td>
+                <td>{row['count']}</td>
+                <td>{row['sum']:.4f}</td>
+                <td>{row['mean']:.4f}</td>
+            </tr>
+            """
+        
+        html_content += "</table>"
+    
+    # 添加每日统计
+    if 'daily_stats' in results:
+        html_content += """
+        <h2>每日统计</h2>
+        <table>
+            <tr>
+                <th>日期</th>
+                <th>交易次数</th>
+                <th>总利润</th>
+                <th>平均利润</th>
+            </tr>
+        """
+        
+        for _, row in results['daily_stats'].iterrows():
+            html_content += f"""
+            <tr>
+                <td>{row['date']}</td>
+                <td>{row['count']}</td>
+                <td>{row['sum']:.4f}</td>
+                <td>{row['mean']:.4f}</td>
+            </tr>
+            """
+        
+        html_content += "</table>"
+    
+    # 添加图表
+    html_content += """
+        <h2>数据可视化</h2>
+        
+        <div class="image-container">
+            <h3>利润分布</h3>
+            <img src="profit_distribution.png" alt="利润分布">
+        </div>
+        
+        <div class="image-container">
+            <h3>价差百分比与利润的关系</h3>
+            <img src="price_diff_vs_profit.png" alt="价差百分比与利润的关系">
+        </div>
+        
+        <div class="image-container">
+            <h3>每日总利润趋势</h3>
+            <img src="daily_profit_trend.png" alt="每日总利润趋势">
+        </div>
+        
+        <div class="image-container">
+            <h3>各交易对总利润</h3>
+            <img src="symbol_profit.png" alt="各交易对总利润">
+        </div>
+        
+        <div class="image-container">
+            <h3>特征相关性热图</h3>
+            <img src="correlation_heatmap.png" alt="特征相关性热图">
+        </div>
+        
+        <div class="image-container">
+            <h3>各小时平均利润</h3>
+            <img src="hourly_avg_profit.png" alt="各小时平均利润">
+        </div>
+    """
+    
+    # 结束HTML
+    html_content += """
+    </body>
+    </html>
+    """
+    
+    # 写入HTML文件
+    with open(os.path.join(output_dir, 'analysis_report.html'), 'w', encoding='utf-8') as f:
+        f.write(html_content)
+    
+    print(f"分析报告已生成: {os.path.join(output_dir, 'analysis_report.html')}")
+
+def main():
+    """主函数"""
+    import argparse
+    parser = argparse.ArgumentParser(description='交易数据分析脚本')
+    parser.add_argument('file_path', type=str, help='要分析的XLSX文件的路径')
+    args = parser.parse_args()
+
+    print(f"开始读取XLSX文件: {args.file_path}")
+    df = read_xlsx_file(args.file_path)
+    
+    if df is not None and not df.empty:
+        print(f"成功读取数据,共 {len(df)} 行")
+        
+        print("正在预处理数据...")
+        processed_df = preprocess_data(df)
+        
+        print("正在分析数据...")
+        results = analyze_data(processed_df)
+        
+        print("正在生成可视化图表...")
+        visualize_data(processed_df, results)
+        
+        print("正在生成分析报告...")
+        generate_report(results)
+        
+        print("分析完成!")
+    else:
+        print("没有数据可供分析")
+
+if __name__ == "__main__":
+    main()