爬虫实现QQ群作业的批量下载
前提:使用者需要拥有该群聊的管理员权限,才可以下载所有人的作业
下载结果:本工具可以将群内所有作业全部下载到 downloaded 文件夹中,并按照每个实验单独一个文件夹、每个实验内按照 n 个群成员生成 n 个对应文件夹(作业存放在对应文件夹中)
一、qq群抓包
由于批量下载的爬虫代码中需要获取QQ群聊相关的QQ群号、cookie、bkn信息,因此需要从网页中获取相关值
参考:解密之QQ的bkn值,获取QQ群成员信息,获取QQ好友列表信息
先进入https://qun.qq.com/member.html
,按 F12(部分键盘需 Fn+F12)打开开发者工具后切换群聊,即可在“网络”面板的请求中获取其中的 cookie 和 bkn 值
拿到后再运行二中的代码,即可实现批量下载
二、批量下载
"""Batch-download every member's homework submissions from a QQ group.

The operator must be an administrator of the group.  The group number,
cookie and bkn value are read interactively (capture them from
https://qun.qq.com/member.html via the browser dev tools, F12).

Outputs:
  * homework.db -- one sqlite table per homework holding each student's
    feedback record (FINISHED = 0 for not-yet, 1 for finished)
  * downloaded/<hw_id>/<nick>_<uin>/ -- the submitted files
"""

import os
import re
import sqlite3
from concurrent.futures import ThreadPoolExecutor
from concurrent.futures import wait as pool_wait

import requests
from tqdm import tqdm  # progress bar

# NOTE(security): every request is made with verify=False (as in the
# original script) -- TLS certificates are NOT checked, which permits
# man-in-the-middle attacks.  Kept for compatibility; the warnings are
# silenced so the progress bars stay readable.
requests.packages.urllib3.disable_warnings()

# User-Agent copied verbatim from the original script (split only for line length).
_UA = ("Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 "
       "(KHTML, like Gecko) QQ/9.2.3.26683 Chrome/43.0.2357.134 "
       "Safari/537.36 QBCore/3.43.1297.400 QQBrowser/9.0.2524.400")

# URL pattern used to harvest links out of the raw feedback payloads.
_URL_RE = re.compile(r'(https?|ftp|file)://[-A-Za-z0-9+&@#/%?=~_|!:,.;]+'
                     r'[-A-Za-z0-9+&@#/%=~_|]')


def _headers(cookie):
    """Return the common request headers for the homework API calls."""
    return {
        "Referer": "https://qun.qq.com/homework/p/features/index.html",
        "Origin": "https://qun.qq.com",
        "User-Agent": _UA,
        "Cookie": cookie,
    }


def _api_post(session, url, data, cookie):
    """POST `data` to a qun.qq.com endpoint and return the decoded JSON.

    Raises requests.HTTPError on a non-2xx response so callers can retry.
    """
    r = session.post(url, data=data, headers=_headers(cookie),
                     verify=False, timeout=60)
    r.raise_for_status()
    return r.json()


def fetch_homework_list(session, group, bkn, cookie):
    """Page through get_hw_list.fcg and return every homework entry."""
    homework = []
    for page in range(1, 9999):  # hard upper bound as a safety net
        print("get homework list... page " + str(page))
        r = _api_post(
            session,
            "https://qun.qq.com/cgi-bin/homework/hw/get_hw_list.fcg",
            {"num": page, "group_id": group, "cmd": 21,
             "page_size": 20, "client_type": 1, "bkn": bkn},
            cookie)
        print(r)
        # NOTE(review): matching the original logic, the page carrying
        # end_flag == 1 is discarded without reading its entries --
        # confirm the API never puts homework on that final page.
        if r['data']['end_flag'] == 1:
            break
        homework.extend(r['data']['homework'])
    return homework


def fetch_feedback(session, group, bkn, cookie, hw_id, status, ftype,
                   max_retries=10):
    """Fetch one homework's feedback list ('notyet' or 'finish').

    Retries transient failures up to `max_retries` times instead of the
    original unbounded `while True` loop, which could hang forever on a
    permanently failing homework id.
    """
    last_err = None
    for _ in range(max_retries):
        try:
            return _api_post(
                session,
                "https://qun.qq.com/cgi-bin/homework/fb/get_hw_feedback.fcg",
                {"group_id": group, "hw_id": hw_id, "status": status,
                 "page": 1, "page_size": 2000, "need_userinfo": 1,
                 "type": ftype, "client_type": 1, "bkn": bkn},
                cookie)
        except Exception as e:  # best-effort: network/JSON errors only expected here
            last_err = e
            print(e)
    raise RuntimeError("giving up on hw %s after %d attempts"
                       % (hw_id, max_retries)) from last_err


def write_db(db, details_notyet, details_finish):
    """Persist every student's feedback into one table per homework.

    Tables use IF NOT EXISTS and rows use INSERT OR REPLACE so the
    script can be re-run without crashing on existing data (the
    original CREATE/INSERT failed on a second run).
    """
    c = db.cursor()
    for homework_id in details_notyet:
        # Table names cannot be bound parameters; force the id to int so
        # a malformed/hostile id cannot inject SQL into the statement.
        table = "HOMEWORK_%d" % int(homework_id)
        c.execute("CREATE TABLE IF NOT EXISTS %s ("
                  "NAME VARCHAR(30) PRIMARY KEY NOT NULL, "
                  "FINISHED INTEGER, "
                  "CONTENT VARCHAR NOT NULL)" % table)
        for finished, details in ((0, details_notyet), (1, details_finish)):
            feedback = details.get(homework_id, {}).get('data', {}).get('feedback')
            if not feedback:
                print("no %s %s" % ("finish" if finished else "notyet", homework_id))
                continue
            c.executemany(
                "INSERT OR REPLACE INTO %s VALUES (?, ?, ?)" % table,
                [(stu['nick'], finished, str(stu)) for stu in feedback])
        db.commit()


def collect_urls(details_notyet, details_finish):
    """Harvest every URL mentioned anywhere in the feedback payloads."""
    urls = set()
    for details in (details_notyet, details_finish):
        for payload in details.values():
            urls.update(m.group() for m in _URL_RE.finditer(str(payload)))
    return urls


def _safe_filename(name):
    """Reduce a server-supplied file name to a safe basename.

    Strips any path components (path-traversal guard) and characters
    invalid on common filesystems.
    """
    name = os.path.basename(str(name).replace("\\", "/"))
    return re.sub(r'[<>:"|?*]', '_', name) or "unnamed"


def download_and_save(homework_id, student_folder, file_info, max_retries=3):
    """Stream one attachment into downloaded/<hw_id>/<student_folder>/.

    Returns True on success (or if the file already exists), False after
    exhausting `max_retries` attempts.  Truncated files are deleted so a
    later re-run does not mistake them for completed downloads.
    """
    file_name = _safe_filename(file_info['name'])
    target_url = file_info['url']
    student_path = os.path.join("downloaded", homework_id, student_folder)
    os.makedirs(student_path, exist_ok=True)
    file_path = os.path.join(student_path, file_name)

    if os.path.exists(file_path):
        print(f"{file_name} already exists, skipping.")
        return True

    for attempt in range(max_retries + 1):
        try:
            # (connect timeout, no read timeout): large files may stream
            # for a long time between chunks.
            with requests.get(target_url, stream=True, verify=False,
                              timeout=(5, None)) as r:
                r.raise_for_status()
                total_size = int(r.headers.get('content-length', 0))
                with tqdm(total=total_size, unit='iB', unit_scale=True,
                          desc=file_name) as bar, open(file_path, 'wb') as f:
                    for chunk in r.iter_content(chunk_size=8192):
                        if chunk:  # skip keep-alive chunks
                            f.write(chunk)
                            bar.update(len(chunk))
                    received = bar.n
            if total_size != 0 and received != total_size:
                print(f"Download of {file_name} did not complete.")
                os.remove(file_path)  # don't leave a truncated file behind
                continue
            print(f"saved to {file_path}")
            return True
        except Exception as e:
            print(f"Attempt {attempt + 1}/{max_retries} failed to download "
                  f"{file_name}: {e}")
            if os.path.exists(file_path):
                os.remove(file_path)
    print(f"Failed to download {file_name} after {max_retries} attempts.")
    return False


def schedule_downloads(pool, details_finish):
    """Submit one download task per attached file; return the futures."""
    tasks = []
    for homework_id, payload in details_finish.items():
        for stu in payload.get('data', {}).get('feedback', []):
            content = stu.get('content') or {}
            for item in content.get('main', []):
                text = item.get('text')
                if not (isinstance(text, dict) and isinstance(text.get('c'), list)):
                    continue
                for file_info in text['c']:
                    if file_info.get('type') == 'file':
                        student_folder = f"{stu['nick']}_{stu['uin']}"
                        tasks.append(pool.submit(download_and_save,
                                                 str(homework_id),
                                                 student_folder,
                                                 file_info))
    return tasks


def main():
    """Interactive entry point: fetch, record and download everything."""
    group = input("Group: ")
    cookie = input("Cookie: ")
    bkn = input("bkn: ")

    session = requests.Session()

    all_homework = fetch_homework_list(session, group, bkn, cookie)
    print("total: " + str(len(all_homework)))
    print(all_homework)

    # Per-homework feedback, keyed by hw_id, for both status buckets.
    details_notyet = {}
    details_finish = {}
    for entry in all_homework:
        print("get detail..." + str(entry['hw_id']))
        details_notyet[entry['hw_id']] = fetch_feedback(
            session, group, bkn, cookie, entry['hw_id'], "[0,1]", "notyet")
        print(details_notyet[entry['hw_id']])
        details_finish[entry['hw_id']] = fetch_feedback(
            session, group, bkn, cookie, entry['hw_id'], "[2,3]", "finish")
        print(details_finish[entry['hw_id']])

    print("write to db...")
    db = sqlite3.connect("homework.db")
    try:
        write_db(db, details_notyet, details_finish)

        all_urls = collect_urls(details_notyet, details_finish)
        print("total urls: " + str(len(all_urls)))

        # The `with` block waits for the pool, but pool_wait keeps the
        # explicit synchronization point visible before the final prints.
        with ThreadPoolExecutor(max_workers=20) as pool:
            pool_wait(schedule_downloads(pool, details_finish))
        print("All downloads completed.")
        print("Thread pool executor has been shut down.")
    finally:
        db.close()
    print("All done!")


if __name__ == "__main__":
    main()
代码仓库:https://github.com/AkiraZheng/QQHomeworkBatchTool.git
参考:Time Machine -
QQ作业爬虫
红警地图应该放在哪里?存放在哪个文件夹合适?
2017 首爾Kimjang(越冬泡菜)文化節