import pandas as pd
# Load student_data.csv into a DataFrame.
data = pd.read_csv('student_data.csv')
# Data cleaning
def clean_data(df: pd.DataFrame) -> pd.DataFrame:
    """Drop every row of *df* that contains a missing value.

    The rows are removed in place, so the caller's frame is modified, and
    the same object is returned so the result can be assigned or chained.

    Args:
        df: Frame to clean; it is mutated by this call.

    Returns:
        The same ``df`` object, now without rows holding missing values.
    """
    # inplace=True makes dropna return None, hence the explicit return below.
    df.dropna(inplace=True)
    return df
# Clean the loaded data; clean_data drops rows in place and returns the same frame.
cleaned_data = clean_data(data)
# Data analysis
def analyze_data(df: pd.DataFrame) -> tuple:
    """Summarise the publication and grade columns of *df*.

    Args:
        df: Frame with ``publications`` and ``grades`` columns.

    Returns:
        A ``(publication_count, avg_grades)`` pair: the sum of the
        ``publications`` column and the mean of the ``grades`` column.

    Raises:
        KeyError: If either column is missing from ``df``.
    """
    publication_count = df['publications'].sum()
    avg_grades = df['grades'].mean()
    return publication_count, avg_grades
# Run the analysis on the cleaned data and print the two aggregate values.
publications, grades = analyze_data(cleaned_data)
print(f"Total Publications: {publications}, Average Grades: {grades}")
from sklearn.cluster import KMeans

# Feature selection: cluster on the publications and grades columns.
features = cleaned_data[['publications', 'grades']]

# Cluster the rows into three groups with KMeans.
# NOTE(review): no random_state is passed, so the cluster labels may differ
# between runs; confirm whether reproducible output is required.
kmeans = KMeans(n_clusters=3)
clusters = kmeans.fit_predict(features)

# Store each row's cluster label on the frame and print it next to the name.
cleaned_data['cluster'] = clusters
print(cleaned_data[['name', 'cluster']])