147 lines
		
	
	
		
			5.5 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
		
		
			
		
	
	
			147 lines
		
	
	
		
			5.5 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
|  | import pandas as pd | ||
|  | from pymongo import MongoClient | ||
|  | import numpy as np | ||
|  | from datetime import datetime | ||
|  | from concurrent.futures import ThreadPoolExecutor | ||
|  | import statsmodels.api as sm | ||
|  | # 数据库连接配置 | ||
|  | config = { | ||
|  |     'host': 'www.bldcapital.cn', | ||
|  |     'port': 27217, | ||
|  |     'username': 'ZhangZH', | ||
|  |     'password': 'S$#r)JAHE_2C', | ||
|  |     'authSource': 'Alpha101',  # 指定认证数据库 | ||
|  |     'authMechanism': 'SCRAM-SHA-1'  # 指定认证机制 | ||
|  | } | ||
|  | client = MongoClient(**config) | ||
|  | client.admin.command('ping') | ||
|  | print("成功连接到MongoDB服务器")   | ||
|  | db = client['Alpha101'] | ||
|  | #取中证500只 | ||
|  | stocklist = pd.read_csv("C:/Users/86185/Desktop/实习/500stock.csv")   | ||
|  | # 提取 "Stock Code" 列,转成列表 | ||
|  | selected = stocklist['Stock Code'].tolist() | ||
|  | 
 | ||
|  | START = datetime(2020, 1, 1) | ||
|  | END   = datetime(2021, 1, 1) | ||
|  | projection={ | ||
|  |     '_id':0, | ||
|  |     'alpha004':1, | ||
|  |     'time':1 | ||
|  |     } | ||
|  | def fetch(code: str) -> pd.DataFrame: | ||
|  |     cur = db[code].find({'time': {'$gte': START, '$lt': END}}, projection) | ||
|  |     df = pd.DataFrame(list(cur)) | ||
|  |     df['Code'] = code | ||
|  |     return df | ||
|  | 
 | ||
|  | with ThreadPoolExecutor(max_workers=32) as pool: | ||
|  |     dfs = list(pool.map(fetch, selected)) | ||
|  | 
 | ||
|  | # 合并 | ||
|  | df_all = pd.concat(dfs, ignore_index=True) | ||
|  | df_all['time'] = pd.to_datetime(df_all['time'])  # 转日期类型 | ||
|  | df_all_ = df_all.pivot(index='Code', columns='time', values='alpha004') | ||
|  | df_all_.info() | ||
|  | #---------------------去极值------------------------- | ||
|  | def extreme_3sigma(dt,n=3): | ||
|  | 
 | ||
|  |     mean = dt.mean()           # 截面数据均值 | ||
|  |     std = dt.std()             # 截面数据标准差 | ||
|  |     dt_up = mean + n*std       # 上限 | ||
|  |     dt_down = mean - n*std     # 下限 | ||
|  |     return dt.clip(dt_down, dt_up, axis=1)   # 超出上下限的值,赋值为上下限 | ||
|  | 
 | ||
|  | df1=extreme_3sigma(df_all_,3) | ||
|  | print('step1 finished') | ||
|  | #---------------------标准化------------------------- | ||
|  | def standardize_z(dt): | ||
|  |     mean = dt.mean()     #  截面数据均值 | ||
|  |     std = dt.std()       #  截面数据标准差 | ||
|  |     return (dt - mean)/std     # 标准化处理 | ||
|  | 
 | ||
|  | df2 = standardize_z(df1) | ||
|  | print('step2 finished') | ||
|  | #----------------------3.行业中性化-------------------------- | ||
|  | #1生成行业哑变量矩阵 | ||
|  | def generate_industry_dummy_matrix(csv_file_path): | ||
|  |     # 从 CSV 文件中读取数据 | ||
|  |     industry= pd.read_csv(csv_file_path) | ||
|  |     # 获取所有唯一的股票代码和行业名称 | ||
|  |     unique_stocks = industry['code'].unique() | ||
|  |     unique_industries = industry['industry_name'].unique() | ||
|  |     # 创建一个全 0 的 DataFrame 作为初始的哑变量矩阵 | ||
|  |     dummy_matrix = pd.DataFrame(0, index=unique_stocks, columns=unique_industries) | ||
|  |     # 遍历每一行数据,将对应股票在其所属行业的位置赋值为 1 | ||
|  |     for index, row in industry.iterrows(): | ||
|  |         stock = row['code'] | ||
|  |         industry = row['industry_name'] | ||
|  |         dummy_matrix.loc[stock, industry] = 1 | ||
|  |     return dummy_matrix | ||
|  | dummy_matrix= generate_industry_dummy_matrix('C:/Users/86185/Desktop/实习/industry.csv') | ||
|  | ##取股票交集 | ||
|  | market=pd.read_csv('C:/Users/86185/Desktop/实习/market.csv') | ||
|  | market.set_index(market.columns[0], inplace=True) | ||
|  | common_stocks = list( | ||
|  |     set(df2.index) &  | ||
|  |     set(dummy_matrix.index)& | ||
|  |     set(market.index) | ||
|  | ) | ||
|  | # 筛选三个数据框,仅保留共同的股票 | ||
|  | df2_f= df2.loc[common_stocks].copy() | ||
|  | industry_f = dummy_matrix.loc[common_stocks].copy() | ||
|  | market_f = market.loc[common_stocks].copy().astype(float) | ||
|  | #2实现行业中性化 | ||
|  | def neutralization(factor, industry_matrix): | ||
|  |     Y = factor | ||
|  |     df= pd.DataFrame(index=Y.index, columns=Y.columns) | ||
|  |     # 按日期循环进行行业中性化 | ||
|  |     for i in range(Y.shape[1]): | ||
|  |         # 获取当天的因子值(删除缺失值) | ||
|  |         y = Y.iloc[:, i].dropna()         | ||
|  |         # 获取对应的行业哑变量(并确保与 y 索引一致) | ||
|  |         X = industry_matrix.loc[y.index]      | ||
|  |         # 添加常数项 | ||
|  |         X = sm.add_constant(X)      | ||
|  |         # 执行线性回归 | ||
|  |         try: | ||
|  |             result = sm.OLS(y, X).fit() | ||
|  |             # 保存残差(即行业中性化后的因子值) | ||
|  |             df.loc[y.index, Y.columns[i]] = result.resid | ||
|  |         except Exception as e: | ||
|  |             print(f"日期 {Y.columns[i]} 中性化失败: {e}") | ||
|  |             # 失败时保留原始值或填充 NaN | ||
|  |             df.loc[y.index, Y.columns[i]] = y     | ||
|  |     return df | ||
|  | df3=neutralization(df2_f, industry_f) | ||
|  | print('step3 finished') | ||
|  | #----------------------4.市值中性化-------------------------- | ||
|  | def neutralization_size(factor, market_cap): | ||
|  |     Y = factor.astype(float) | ||
|  |     df= pd.DataFrame(index=Y.index, columns=Y.columns)     | ||
|  |     for i in range(Y.shape[1]): | ||
|  |         # 获取当天的因子值(删除缺失值) | ||
|  |         y = Y.iloc[:, i].dropna() | ||
|  |         # 获取当天的市值并计算对数(删除缺失值和非正数) | ||
|  |         ln_mkt_cap = np.log(market_cap.iloc[:, i]) | ||
|  |         ln_mkt_cap = ln_mkt_cap.replace([np.inf, -np.inf], np.nan).dropna() | ||
|  |         # 确保市值和因子的股票索引一致 | ||
|  |         common_idx = list(set(y.index).intersection(set(ln_mkt_cap.index))) | ||
|  |         y = y.loc[common_idx] | ||
|  |         ln_mkt_cap = ln_mkt_cap.loc[common_idx] | ||
|  |         # 添加常数项 | ||
|  |         X = sm.add_constant(ln_mkt_cap) | ||
|  |         # 执行线性回归 | ||
|  |         try: | ||
|  |             result = sm.OLS(y, X).fit() | ||
|  |             # 保存残差(即市值中性化后的因子值) | ||
|  |             df.loc[common_idx, Y.columns[i]] = result.resid | ||
|  |         except Exception as e: | ||
|  |             print(f"日期 {Y.columns[i]} 中性化失败: {e}") | ||
|  |             # 失败时保留原始值或填充 NaN | ||
|  |             df.loc[common_idx, Y.columns[i]] = y | ||
|  |      | ||
|  |     return df | ||
|  | df4=neutralization_size(df3, market_f) | ||
|  | df4.to_csv('df4.csv') | ||
|  | print('step4 finished') |