用1个Python脚本搞定脏数据清洗、统计分析和报告生成——真实教学场景实战

你好,我是提米哥,TMDM.cn 的首席选品官,专盯「开发者真能用、老师真敢交、学生真能学」的硬核项目。

今天不讲概念,不画大饼,就带你写一个老师明天就能拿去用的学生成绩处理工具——它不是玩具代码,不是“Hello World”式演示,而是一个完整、健壮、带日志、会报错、能存文件、输出整齐报告的真实程序。

你之前学过的所有 Python 基础:变量、函数、循环、列表推导、文件读写、异常处理、CSV 解析、日期时间……全都会在这里自然地“咔哒”一声咬合在一起。就像第一次把乐高零件拼成一辆能跑的车——零件没变,但世界变了。

下面就是完整实现,每一步都配了中文注释,新手也能照着敲、改、跑、懂。


✅ 第一步:准备测试数据(students.csv)

把下面内容复制粘贴,保存为 students.csv 文件(和你的 .py 文件放同一文件夹):

name,score
Alex,92
Priya,78
Sam,
Jordan,61
Lisa,45
Ravi,88
null,55
Tom,102
Maya,73
Arjun,34
Zara,91
Ben,abc
Carlos,67
Nina,39
Oscar,85

💡 这份数据是「故意搞砸」的:空分数、非法姓名(null)、超分(102)、字母当分数(abc)……真实世界的数据,从来都是这样乱的。


✅ 第二步:加载数据(安全读取 CSV)

import csv
import json
from datetime import datetime


def load_students(filename):
    """Load raw student rows from a CSV file with ``name`` and ``score`` columns.

    Values are returned as unvalidated strings; validation is left to the
    caller. A field that is absent from a row (short row or missing column)
    is returned as ``""`` instead of ``None`` so downstream string handling
    never sees a non-string.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list of ``{"name": str, "score": str}`` dicts in file order, or an
        empty list when the file does not exist.
    """
    def read_rows(encoding):
        # newline="" is what the csv module documents for files given to a reader.
        with open(filename, "r", encoding=encoding, newline="") as file:
            return list(csv.DictReader(file))

    try:
        try:
            # utf-8-sig drops the BOM that spreadsheet exports often prepend;
            # without it the first header would be "\ufeffname", not "name".
            rows = read_rows("utf-8-sig")
        except UnicodeDecodeError:
            # Not UTF-8: fall back to the platform default encoding, which is
            # what a plain open(filename) would have used.
            rows = read_rows(None)
    except FileNotFoundError:
        print(f"Error: {filename} not found.")
        return []  # An empty list tells the caller there is nothing to process.

    students = [
        {"name": row.get("name") or "", "score": row.get("score") or ""}
        for row in rows
    ]

    print(f"Loaded {len(students)} raw records.")
    return students

✅ 第三步:清洗数据(拒绝脏数据,还告诉你为什么)

def clean_students(students):
    """Validate raw student rows and return only the usable ones.

    A row is rejected when its name is empty or the placeholder ``null``,
    when its score is empty or not a number, or when the score is not within
    0..100 (this also rejects ``nan`` and ``inf``). A summary of the accepted
    and rejected rows, with one reason per rejection, is printed.

    Args:
        students: Iterable of dicts with ``name`` and ``score`` entries holding
            strings (``None`` or a missing entry is treated as empty).

    Returns:
        A list of ``{"name": str, "score": float}`` dicts in input order.
    """
    clean = []
    rejected = []

    for student in students:
        # ``or ""`` guards against None, which csv.DictReader yields for
        # fields missing from a short row.
        name = (student.get("name") or "").strip()
        score_raw = (student.get("score") or "").strip()

        # Rule 1: the name is empty or the literal placeholder "null".
        if not name or name.lower() == "null":
            rejected.append(f"Skipped: invalid name '{name}'")
            continue

        # Rule 2: the score field is empty.
        if not score_raw:
            rejected.append(f"Skipped: {name} has no score")
            continue

        # Rule 3: the score is not parseable as a number.
        try:
            score = float(score_raw)
        except ValueError:
            rejected.append(f"Skipped: {name} has non-numeric score '{score_raw}'")
            continue

        # Rule 4: the score must lie in 0..100. The chained comparison is
        # False for NaN, whereas ``score < 0 or score > 100`` would let NaN
        # through and poison every later statistic.
        if not 0 <= score <= 100:
            rejected.append(f"Skipped: {name} has out-of-range score {score}")
            continue

        clean.append({"name": name, "score": score})

    # Report how many rows survived and why each of the others was dropped.
    print(f"\nCleaning complete:")
    print(f"  Valid records: {len(clean)}")
    print(f"  Rejected: {len(rejected)}")
    for reason in rejected:
        print(f"  {reason}")

    return clean

✅ 第四步:分析数据(算平均分、排名、及格率…全靠一行式列表推导)

def analyze(students):
    """Compute class statistics from cleaned student records.

    Args:
        students: List of ``{"name": ..., "score": ...}`` dicts with numeric
            scores.

    Returns:
        ``None`` (after printing a notice) when ``students`` is empty.
        Otherwise a dict with the head count, rounded average, highest and
        lowest score, pass/fail counts, pass rate in percent, the students
        scoring 85 or more (best first), the students scoring below 40
        (worst first), and the full ranking (best first).
    """
    if not students:
        print("No valid students to analyze.")
        return None

    def by_score(record):
        return record["score"]

    headcount = len(students)
    mean = sum(by_score(record) for record in students) / headcount

    # Pass mark is 40; honours threshold is 85.
    passing = list(filter(lambda record: by_score(record) >= 40, students))
    failing = list(filter(lambda record: by_score(record) < 40, students))
    honours = list(filter(lambda record: by_score(record) >= 85, students))

    # Sort copies so the caller's list keeps its original order.
    honours.sort(key=by_score, reverse=True)
    failing.sort(key=by_score)
    ranking = list(students)
    ranking.sort(key=by_score, reverse=True)

    return {
        "total_students": headcount,
        "average_score": round(mean, 2),
        "highest_score": max(by_score(record) for record in students),
        "lowest_score": min(by_score(record) for record in students),
        "pass_count": len(passing),
        "fail_count": len(failing),
        "pass_rate": round((len(passing) / headcount) * 100, 1),
        "top_performers": honours,
        "failed_students": failing,
        "all_students": ranking,
    }

✅ 第五步:终端显示报告(对齐、分区、清晰易读)

def display_results(results):
    """Print the grade report to standard output.

    Args:
        results: The statistics dict produced by the analysis step. When it
            is falsy nothing is printed.
    """
    if not results:
        return

    rule = "=" * 45

    def print_group(heading, key):
        # One "name  score" line per student, or "None" for an empty group.
        print(heading)
        members = results[key]
        if not members:
            print("  None")
            return
        for member in members:
            # <15 left-aligns the name in a 15-character column.
            print(f"  {member['name']:<15} {member['score']}")

    print("\n" + rule)
    print("        STUDENT GRADE REPORT")
    print(rule)

    # (label, results key, suffix) for each summary line, in display order.
    summary_layout = (
        ("\nTotal Students : ", "total_students", ""),
        ("Class Average  : ", "average_score", ""),
        ("Highest Score  : ", "highest_score", ""),
        ("Lowest Score   : ", "lowest_score", ""),
        ("Pass Rate      : ", "pass_rate", "%"),
        ("Passed         : ", "pass_count", ""),
        ("Failed         : ", "fail_count", ""),
    )
    for label, key, suffix in summary_layout:
        print(f"{label}{results[key]}{suffix}")

    print_group("\n--- Top Performers (85+) ---", "top_performers")
    print_group("\n--- Students Who Failed (<40) ---", "failed_students")

    print("\n--- Full Rankings ---")
    position = 0
    for member in results["all_students"]:
        position += 1
        if member["score"] >= 40:
            verdict = "PASS"
        else:
            verdict = "FAIL"
        print(f"  {position}. {member['name']:<15} {member['score']:<8} {verdict}")

    print(rule)

✅ 第六步:保存报告到文件(报告开头带生成时间戳;注意:文件名固定,同名旧报告会被覆盖)

def save_report(results, filename="report.txt"):
    """Write the grade report to a text file, replacing any existing file.

    The file is written as UTF-8 so student names outside the platform's
    default code page cannot raise ``UnicodeEncodeError``. The whole report
    is rendered before the file is opened, so a malformed ``results`` dict
    cannot leave a half-written file behind.

    Args:
        results: The statistics dict produced by the analysis step. When it
            is falsy nothing is written.
        filename: Destination path; defaults to ``report.txt``.

    Returns:
        None. File-system errors are reported on standard output rather than
        raised.
    """
    if not results:
        return

    # The generation time is recorded inside the report, not in the filename.
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    lines = [
        f"Grade Report - Generated {timestamp}\n",
        "=" * 45 + "\n\n",
        f"Total Students : {results['total_students']}\n",
        f"Class Average  : {results['average_score']}\n",
        f"Highest Score  : {results['highest_score']}\n",
        f"Lowest Score   : {results['lowest_score']}\n",
        f"Pass Rate      : {results['pass_rate']}%\n\n",
        "Top Performers:\n",
    ]
    lines.extend(
        f"  {s['name']}: {s['score']}\n" for s in results["top_performers"]
    )

    lines.append("\nFull Rankings:\n")
    for i, s in enumerate(results["all_students"], 1):
        status = "PASS" if s["score"] >= 40 else "FAIL"
        lines.append(f"  {i}. {s['name']}: {s['score']} - {status}\n")

    try:
        with open(filename, "w", encoding="utf-8") as file:
            file.writelines(lines)
    except OSError as e:
        # Disk full, permission denied and similar system-level failures.
        print(f"Could not save report: {e}")
    else:
        print(f"\nReport saved to {filename}")

✅ 第七步:主程序(把所有模块串起来)

def main():
    """Run the whole pipeline: load, clean, analyze, display, then save.

    Reads ``students.csv`` from the current directory and stops early, with a
    short notice, when loading or cleaning leaves nothing to work on.
    """
    for banner_line in ("Starting Grade Processor...", "-" * 45):
        print(banner_line)

    loaded = load_students("students.csv")
    if loaded:
        usable = clean_students(loaded)
        if usable:
            summary = analyze(usable)
            display_results(summary)
            save_report(summary)
            print("\nDone.")
        else:
            print("No valid data after cleaning.")
    else:
        print("Nothing to process.")


# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

if __name__ == "__main__": 是 Python 的「守门员」:只有直接运行这个 .py 文件(例如在终端执行 python grade_processor.py)时,main() 才会启动;如果别人把它当模块导入(import grade_processor),就不会自动执行——这是专业项目的标准写法。


▶️ 怎么运行?

在终端里,进入你放 students.csv 和 grade_processor.py 的文件夹,输入:

python grade_processor.py

你会看到:
– 清洗过程逐条反馈(哪几条被删了?为什么?)
– 终端输出一份排版工整的学生成绩报告
– 同时自动生成 report.txt 文件,双击就能发给老师


🔧 动手试试(这才是真正学会的关键)

别光看!马上做这3件事,你会立刻感觉「我真会用了」:
– 删除 students.csv,再运行程序 → 看它是否优雅报错,不崩溃
– 在 CSV 末尾加一行 Emma,87,再运行 → 看新同学是否自动进排名
– 把代码里所有 40 改成 50(及格线提高),再运行 → 看 Arjun 和 Nina 是否仍然是“FAIL”,而 Lisa(45)是否也从“PASS”变成“FAIL”、跟着挂科了

🌟 这就是真实开发:改一个数字,影响整个逻辑流。而你写的每一行,都在掌控它。

直达网址:https://tmdm.cn/developer/student-grade-processor

作加

类似文章