# Create statistical features (continuation of the numeric-feature method above)
for column in numeric_columns:
    processed_data[f'{column}_log'] = np.log1p(data[column])  # log1p tolerates zeros
    processed_data[f'{column}_sqrt'] = np.sqrt(np.abs(data[column]))
    processed_data[f'{column}_square'] = data[column] ** 2
return processed_data
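# A quick standalone check of the three transforms on toy data -- np.log1p is used
# (rather than np.log) because it is well-defined at zero and accurate for small
# values; the column name and sample values below are illustrative only:
import numpy as np
import pandas as pd

demo = pd.DataFrame({'income': [0.0, 1_000.0, 50_000.0, 250_000.0]})
print(np.log1p(demo['income']))         # compresses the heavy right tail
print(np.sqrt(np.abs(demo['income'])))  # milder compression, sign-insensitive
print(demo['income'] ** 2)              # amplifies large values instead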
# Usage example
processor = NumericDataProcessor()

# Detect outliers
outliers = processor.detect_outliers(continuous_df)
print("\nOutlier detection results:")
for column, info in outliers.items():
    if info['count'] > 0:
        print(f"{column}: {info['count']} outliers ({info['percentage']:.2f}%)")
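# detect_outliers itself is not shown in this excerpt. Below is a minimal sketch of
# one common implementation, assuming the IQR rule; the returned keys 'count' and
# 'percentage' match the usage above, but the function name and the 1.5 multiplier
# are assumptions, not the author's confirmed code:
import numpy as np
import pandas as pd

def detect_outliers_sketch(df, factor=1.5):
    """Flag values outside [Q1 - factor*IQR, Q3 + factor*IQR] per numeric column."""
    results = {}
    for column in df.select_dtypes(include=[np.number]).columns:
        q1, q3 = df[column].quantile(0.25), df[column].quantile(0.75)
        iqr = q3 - q1
        mask = (df[column] < q1 - factor * iqr) | (df[column] > q3 + factor * iqr)
        results[column] = {'count': int(mask.sum()), 'percentage': 100.0 * mask.mean()}
    return results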
# Text data processing
import re  # used by clean_text below; missing from the original imports

import jieba
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import LabelEncoder
class TextDataProcessor:
    def __init__(self):
        self.vectorizers = {}
        self.label_encoders = {}

    def clean_text(self, text):
        """Text cleaning."""
        # Remove everything except Chinese characters and whitespace
        text = re.sub(r'[^\u4e00-\u9fff\s]', '', text)
        # Collapse redundant whitespace
        text = re.sub(r'\s+', ' ', text).strip()
        return text

    def tokenize_chinese(self, text):
        """Chinese word segmentation."""
        words = jieba.lcut(text)
        # Remove stop words (simplified list)
        stop_words = {'的', '了', '在', '是', '我', '有', '和', '就', '不', '人', '都',
                      '一', '一个', '上', '也', '很', '到', '说', '要', '去', '你', '会',
                      '着', '没有', '看', '好', '自己', '这'}
        words = [word for word in words if word not in stop_words and len(word) > 1]
        return words

    def extract_features(self, texts, method='tfidf'):
        """Feature extraction."""
        # Text preprocessing
        cleaned_texts = [self.clean_text(text) for text in texts]
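        # The method is cut off here in the original excerpt. A plausible
        # continuation, assuming it vectorizes with TF-IDF or raw counts and caches
        # the fitted vectorizer; max_features=1000 is illustrative only:
        if method == 'tfidf':
            vectorizer = TfidfVectorizer(tokenizer=self.tokenize_chinese,
                                         token_pattern=None,  # silenced: tokenizer overrides it
                                         max_features=1000)
        else:
            vectorizer = CountVectorizer(tokenizer=self.tokenize_chinese,
                                         token_pattern=None,
                                         max_features=1000)
        features = vectorizer.fit_transform(cleaned_texts)
        self.vectorizers[method] = vectorizer
        return features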
# Extract color features
color_features = image_processor.extract_color_features(rainbow_img)
print("\nColor features:")
for key, value in color_features.items():
    if not key.startswith('hist'):
        print(f"{key}: {value:.2f}")
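# extract_color_features is not defined in this excerpt. A minimal sketch consistent
# with the usage above (scalar per-channel statistics plus 'hist*' keys that the
# print loop skips, since arrays cannot format with :.2f) -- the exact statistics
# and bin count are assumptions:
import numpy as np

def extract_color_features_sketch(img):
    """img: H x W x 3 uint8 RGB array."""
    features = {}
    for i, channel in enumerate(('r', 'g', 'b')):
        values = img[:, :, i].astype(np.float64)
        features[f'{channel}_mean'] = values.mean()
        features[f'{channel}_std'] = values.std()
        # 16-bin intensity histogram, normalized to sum to 1
        hist, _ = np.histogram(values, bins=16, range=(0, 256))
        features[f'hist_{channel}'] = hist / hist.sum()
    return features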
# Extract texture features
texture_features = image_processor.extract_texture_features(checkerboard_img)
print("\nTexture features:")
for key, value in texture_features.items():
    if key != 'lbp_hist':
        print(f"{key}: {value:.4f}")
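# extract_texture_features is likewise not shown; the 'lbp_hist' key suggests Local
# Binary Patterns. A sketch using scikit-image's local_binary_pattern -- the P/R
# parameters and the extra contrast/entropy statistics are assumptions:
import numpy as np
from skimage.feature import local_binary_pattern

def extract_texture_features_sketch(gray_img, points=8, radius=1):
    """gray_img: 2-D uint8 grayscale array."""
    lbp = local_binary_pattern(gray_img, P=points, R=radius, method='uniform')
    n_bins = points + 2  # 'uniform' LBP yields exactly P + 2 distinct codes
    hist, _ = np.histogram(lbp, bins=n_bins, range=(0, n_bins))
    hist = hist / hist.sum()
    return {
        'lbp_hist': hist,                # array-valued; skipped by the print loop above
        'contrast': gray_img.std(),      # simple global contrast proxy
        'entropy': -np.sum(hist * np.log2(hist + 1e-12)),
    }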
# Print processing results; .shape works for both DataFrames and ndarrays,
# so the original's two identical if/else branches collapse into one print
for modality, features in processed_features.items():
    print(f"{modality} features: {features.shape}")
# Collect each modality as a plain ndarray for fusion
feature_arrays = []  # initialization was missing in the original
for modality, features in processed_features.items():
    if isinstance(features, pd.DataFrame):
        feature_arrays.append(features.values)
    else:
        feature_arrays.append(features)
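# With every modality shaped (n_samples, n_features_i), the usual next step is
# column-wise concatenation into one fused matrix. That step is not shown in the
# excerpt, so the hstack call below is an assumption; if any modality is a scipy
# sparse matrix (e.g. TF-IDF output), use scipy.sparse.hstack instead:
import numpy as np

fused_features = np.hstack(feature_arrays)  # requires equal n_samples across modalities
print(f"Fused feature matrix: {fused_features.shape}")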