用1个Python脚本搞定脏数据清洗、统计分析和报告生成——真实教学场景实战
你好,我是提米哥,TMDM.cn 的首席选品官,专盯「开发者真能用、老师真敢交、学生真能学」的硬核项目。
今天不讲概念,不画大饼,就带你写一个老师明天就能拿去用的学生成绩处理工具——它不是玩具代码,不是“Hello World”式演示,而是一个完整、健壮、带日志、会报错、能存文件、输出整齐报告的真实程序。
你之前学过的所有 Python 基础:变量、函数、循环、列表推导、文件读写、异常处理、CSV 解析、日期时间……全都会在这里自然地“咔哒”一声咬合在一起。就像第一次把乐高零件拼成一辆能跑的车——零件没变,但世界变了。
下面就是完整实现,每一步都配了中文注释,新手也能照着敲、改、跑、懂。
✅ 第一步:准备测试数据(students.csv)
把下面内容复制粘贴,保存为 students.csv 文件(和你的 .py 文件放同一文件夹):
name,score
Alex,92
Priya,78
Sam,
Jordan,61
Lisa,45
Ravi,88
null,55
Tom,102
Maya,73
Arjun,34
Zara,91
Ben,abc
Carlos,67
Nina,39
Oscar,85
💡 这份数据是「故意搞砸」的:空分数(Sam)、非法姓名(null)、超分(102)、字母当分数(abc)……真实世界的数据,从来都是这样乱的。
✅ 第二步:加载数据(安全读取 CSV)
import csv
import json
from datetime import datetime
def load_students(filename):
    """Read raw student rows from a CSV file with ``name`` and ``score`` columns.

    Values are returned as unvalidated strings; validation is left to the
    cleaning step.

    Args:
        filename: Path of the CSV file to read (expected to be UTF-8 text).

    Returns:
        A list of ``{"name": str, "score": str}`` dicts, one per data row.
        An empty list when the file does not exist or is not valid UTF-8.
    """
    students = []
    try:
        # newline="" lets the csv module handle line endings itself, as its
        # documentation requires. "utf-8-sig" decodes UTF-8 and skips a
        # leading byte-order mark, which would otherwise become part of the
        # first header name and make the "name" column unreachable.
        with open(filename, "r", newline="", encoding="utf-8-sig") as file:
            for row in csv.DictReader(file):
                students.append({
                    # DictReader fills fields missing from a short row with
                    # None, and .get() covers an absent column; normalise
                    # both to "" so every value stays a string.
                    "name": row.get("name") or "",
                    "score": row.get("score") or "",
                })
    except FileNotFoundError:
        print(f"Error: {filename} not found.")
        return []  # An empty list tells the caller there is nothing to process.
    except UnicodeDecodeError:
        print(f"Error: {filename} is not valid UTF-8 text.")
        return []
    print(f"Loaded {len(students)} raw records.")
    return students
✅ 第三步:清洗数据(拒绝脏数据,还告诉你为什么)
def clean_students(students):
    """Validate raw student records and keep only the usable ones.

    A record is rejected when its name is empty or the placeholder "null",
    when its score is empty or not numeric, or when the score is not within
    0..100. A summary of accepted and rejected records is printed.

    Args:
        students: Iterable of dicts with ``name`` and ``score`` entries.

    Returns:
        A list of ``{"name": str, "score": float}`` dicts for valid records.
    """
    clean = []
    rejected = []
    for student in students:
        # A missing value can arrive as None (e.g. from a short CSV row);
        # treat it as empty instead of crashing on .strip().
        raw_name = student.get("name")
        raw_score = student.get("score")
        name = "" if raw_name is None else str(raw_name).strip()
        score_raw = "" if raw_score is None else str(raw_score).strip()

        # Rule 1: the name is empty or the placeholder "null".
        if not name or name.lower() == "null":
            rejected.append(f"Skipped: invalid name '{name}'")
            continue

        # Rule 2: the score is empty.
        if not score_raw:
            rejected.append(f"Skipped: {name} has no score")
            continue

        # Rule 3: the score is not a number at all.
        try:
            score = float(score_raw)
        except ValueError:
            rejected.append(f"Skipped: {name} has non-numeric score '{score_raw}'")
            continue

        # Rule 4: the score must lie within 0..100. Written as a positive
        # range test because float() also parses "nan", and NaN compares
        # False against everything, so "score < 0 or score > 100" would let
        # it through.
        if not 0 <= score <= 100:
            rejected.append(f"Skipped: {name} has out-of-range score {score}")
            continue

        clean.append({"name": name, "score": score})

    # Report how many records survived and why the others were dropped.
    print("\nCleaning complete:")
    print(f" Valid records: {len(clean)}")
    print(f" Rejected: {len(rejected)}")
    for reason in rejected:
        print(f" {reason}")
    return clean
✅ 第四步:分析数据(算平均分、排名、及格率…全靠一行式列表推导)
def analyze(students, *, pass_mark=40, top_mark=85):
    """Compute class statistics and rankings from cleaned student records.

    Args:
        students: List of ``{"name": ..., "score": number}`` dicts.
        pass_mark: Minimum score that counts as a pass (default 40).
        top_mark: Minimum score that counts as a top performer (default 85).

    Returns:
        A dict with the keys ``total_students``, ``average_score``,
        ``highest_score``, ``lowest_score``, ``pass_count``, ``fail_count``,
        ``pass_rate``, ``top_performers``, ``failed_students`` and
        ``all_students``; or ``None`` (after printing a notice) when
        ``students`` is empty.
    """
    if not students:
        print("No valid students to analyze.")
        return None

    def by_score(student):
        return student["score"]

    scores = [s["score"] for s in students]
    total = len(scores)
    passed = [s for s in students if s["score"] >= pass_mark]
    failed = [s for s in students if s["score"] < pass_mark]
    top_performers = [s for s in students if s["score"] >= top_mark]

    return {
        "total_students": total,
        "average_score": round(sum(scores) / total, 2),
        "highest_score": max(scores),
        "lowest_score": min(scores),
        "pass_count": len(passed),
        "fail_count": len(failed),
        # Percentage of students at or above the pass mark, one decimal.
        "pass_rate": round((len(passed) / total) * 100, 1),
        # Highest score first.
        "top_performers": sorted(top_performers, key=by_score, reverse=True),
        # Lowest score first.
        "failed_students": sorted(failed, key=by_score),
        # Full class ranking, highest score first.
        "all_students": sorted(students, key=by_score, reverse=True),
    }
✅ 第五步:终端显示报告(对齐、分区、清晰易读)
def _print_score_section(title, students):
    """Print a titled list of name/score rows, or " None" when it is empty."""
    print(title)
    if not students:
        print(" None")
        return
    for s in students:
        # "<15" left-aligns the name in a 15-character column.
        print(f" {s['name']:<15} {s['score']}")


def display_results(results, *, pass_mark=40, top_mark=85):
    """Print the grade report to the terminal.

    Args:
        results: The statistics dict to display; nothing is printed when it
            is empty or ``None``.
        pass_mark: Score at or above which a student is shown as PASS; also
            used in the "failed" section header (default 40).
        top_mark: Threshold shown in the "top performers" section header
            (default 85).
    """
    if not results:
        return

    rule = "=" * 45
    print("\n" + rule)
    print(" STUDENT GRADE REPORT")
    print(rule)

    print(f"\nTotal Students : {results['total_students']}")
    print(f"Class Average : {results['average_score']}")
    print(f"Highest Score : {results['highest_score']}")
    print(f"Lowest Score : {results['lowest_score']}")
    print(f"Pass Rate : {results['pass_rate']}%")
    print(f"Passed : {results['pass_count']}")
    print(f"Failed : {results['fail_count']}")

    _print_score_section(
        f"\n--- Top Performers ({top_mark}+) ---", results["top_performers"]
    )
    _print_score_section(
        f"\n--- Students Who Failed (<{pass_mark}) ---", results["failed_students"]
    )

    print("\n--- Full Rankings ---")
    for rank, s in enumerate(results["all_students"], 1):
        status = "PASS" if s["score"] >= pass_mark else "FAIL"
        print(f" {rank}. {s['name']:<15} {s['score']:<8} {status}")
    print(rule)
✅ 第六步:保存报告到文件(报告里带生成时间;注意:文件名固定,重复运行会覆盖旧报告)
def save_report(results, filename="report.txt", *, pass_mark=40):
    """Write the grade report to a text file, replacing any existing file.

    The report is written as UTF-8 so that names containing non-ASCII
    characters can be saved regardless of the platform's default encoding.

    Args:
        results: The statistics dict to save; nothing is written when it is
            empty or ``None``.
        filename: Path of the report file to write.
        pass_mark: Score at or above which a student is marked PASS
            (default 40).
    """
    if not results:
        return

    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    # Build the whole report before touching the file, so a problem with
    # ``results`` cannot leave a truncated report on disk.
    lines = [
        f"Grade Report - Generated {timestamp}\n",
        "=" * 45 + "\n\n",
        f"Total Students : {results['total_students']}\n",
        f"Class Average : {results['average_score']}\n",
        f"Highest Score : {results['highest_score']}\n",
        f"Lowest Score : {results['lowest_score']}\n",
        f"Pass Rate : {results['pass_rate']}%\n\n",
        "Top Performers:\n",
    ]
    lines.extend(
        f" {s['name']}: {s['score']}\n" for s in results["top_performers"]
    )
    lines.append("\nFull Rankings:\n")
    for rank, s in enumerate(results["all_students"], 1):
        status = "PASS" if s["score"] >= pass_mark else "FAIL"
        lines.append(f" {rank}. {s['name']}: {s['score']} - {status}\n")

    try:
        with open(filename, "w", encoding="utf-8") as file:
            file.writelines(lines)
    except OSError as e:  # IOError is an alias of OSError in Python 3.
        print(f"Could not save report: {e}")
    else:
        print(f"\nReport saved to {filename}")
✅ 第七步:主程序(把所有模块串起来)
def main():
    """Run the whole pipeline: load, clean, analyze, display, then save."""
    print("Starting Grade Processor...")
    print("-" * 45)

    # Stop early when the input file is missing or has no data rows.
    loaded_rows = load_students("students.csv")
    if not loaded_rows:
        print("Nothing to process.")
        return

    # Stop early when every row was rejected during cleaning.
    valid_rows = clean_students(loaded_rows)
    if not valid_rows:
        print("No valid data after cleaning.")
        return

    summary = analyze(valid_rows)
    display_results(summary)
    save_report(summary)
    print("\nDone.")


# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
✅ if __name__ == "__main__": 是 Python 的「守门员」:只有直接运行这个 .py 文件(例如 python grade_processor.py)时,main() 才会启动;如果别人把它当模块导入(import grade_processor),就不会自动执行——这是专业项目的标准写法。
▶️ 怎么运行?
在终端里,进入你放 students.csv 和 grade_processor.py 的文件夹,输入:
python grade_processor.py
你会看到:
– 清洗过程逐条反馈(哪几条被删了?为什么?)
– 终端输出一份排版工整的学生成绩报告
– 同时自动生成 report.txt 文件,双击就能打开,也可以直接发给老师
🔧 动手试试(这才是真正学会的关键)
别光看!马上做这3件事,你会立刻感觉「我真会用了」:
– 删除 students.csv,再运行程序 → 看它是否优雅报错,不崩溃
– 在 CSV 末尾加一行 Emma,87,再运行 → 看新同学是否自动进排名
– 把代码里所有 40 改成 50(及格线提高),再运行 → 看 Arjun 和 Nina 是否依然是 FAIL,而 Lisa(45)是否从 PASS 变成了 FAIL
🌟 这就是真实开发:改一个数字,影响整个逻辑流。而你写的每一行,都在掌控它。
直达网址:https://tmdm.cn/developer/student-grade-processor
