# 整理美团数据.py (19 KB)
# Product resource downloader: parses a pasted product-detail JSON payload and
# saves the referenced images and videos to a chosen directory.
  1. import json
  2. import os
  3. import re
  4. import requests
  5. from pathlib import Path
  6. from urllib.parse import urlparse
  7. from tkinter import *
  8. from tkinter import ttk, filedialog, messagebox, scrolledtext
  9. import threading
  10. from datetime import datetime
  11. class ProductDownloaderGUI:
  12. def __init__(self, root):
  13. self.root = root
  14. self.root.title("商品资源下载器")
  15. self.root.geometry("900x700")
  16. self.root.resizable(True, True)
  17. # 设置样式
  18. style = ttk.Style()
  19. style.theme_use('clam')
  20. # 创建主框架
  21. main_frame = ttk.Frame(root, padding="10")
  22. main_frame.pack(fill=BOTH, expand=True)
  23. # 输入区域
  24. input_frame = ttk.LabelFrame(main_frame, text="商品数据输入", padding="10")
  25. input_frame.pack(fill=BOTH, expand=True, pady=(0, 10))
  26. # 输入框
  27. self.input_text = scrolledtext.ScrolledText(input_frame, height=15, font=("Consolas", 10))
  28. self.input_text.pack(fill=BOTH, expand=True)
  29. # 输出目录选择
  30. output_frame = ttk.LabelFrame(main_frame, text="输出设置", padding="10")
  31. output_frame.pack(fill=X, pady=(0, 10))
  32. dir_select_frame = ttk.Frame(output_frame)
  33. dir_select_frame.pack(fill=X)
  34. self.output_dir = StringVar()
  35. self.output_entry = ttk.Entry(dir_select_frame, textvariable=self.output_dir)
  36. self.output_entry.pack(side=LEFT, fill=X, expand=True, padx=(0, 5))
  37. browse_btn = ttk.Button(dir_select_frame, text="浏览...", command=self.browse_output_dir)
  38. browse_btn.pack(side=RIGHT)
  39. # 信息显示区域
  40. info_frame = ttk.LabelFrame(main_frame, text="商品信息", padding="10")
  41. info_frame.pack(fill=BOTH, expand=True, pady=(0, 10))
  42. self.info_text = scrolledtext.ScrolledText(info_frame, height=8, font=("微软雅黑", 10))
  43. self.info_text.pack(fill=BOTH, expand=True)
  44. # 进度条
  45. self.progress_var = DoubleVar()
  46. self.progress_bar = ttk.Progressbar(main_frame, variable=self.progress_var, maximum=100)
  47. self.progress_bar.pack(fill=X, pady=(0, 10))
  48. # 状态标签
  49. self.status_label = ttk.Label(main_frame, text="就绪")
  50. self.status_label.pack(pady=(0, 10))
  51. # 按钮区域
  52. button_frame = ttk.Frame(main_frame)
  53. button_frame.pack(fill=X)
  54. # 启动按钮
  55. self.start_btn = ttk.Button(button_frame, text="▶ 启动下载", command=self.start_download, width=15)
  56. self.start_btn.pack(side=LEFT, padx=5)
  57. clear_btn = ttk.Button(button_frame, text="清空输入", command=self.clear_input)
  58. clear_btn.pack(side=LEFT, padx=5)
  59. # 退出按钮
  60. exit_btn = ttk.Button(button_frame, text="退出", command=self.root.quit)
  61. exit_btn.pack(side=RIGHT, padx=5)
  62. # 设置下载会话
  63. self.session = requests.Session()
  64. self.session.headers.update({
  65. 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
  66. })
  67. def browse_output_dir(self):
  68. """选择输出目录"""
  69. directory = filedialog.askdirectory()
  70. if directory:
  71. self.output_dir.set(directory)
  72. def clear_input(self):
  73. """清空输入框"""
  74. self.input_text.delete(1.0, END)
  75. def update_info_display(self, product_info):
  76. """更新商品信息显示"""
  77. self.info_text.delete(1.0, END)
  78. info_str = f"""
  79. 【商品名称】{product_info.get('name', '未知')}
  80. 【商品描述】{product_info.get('description', '无')}
  81. 【价格】¥{product_info.get('min_price', '未知')}
  82. 【规格】{product_info.get('spec', '无')}
  83. 【品牌】{product_info.get('brand', '未知')}
  84. 【月销量】{product_info.get('month_saled', '未知')}
  85. """
  86. self.info_text.insert(1.0, info_str.strip())
  87. def extract_urls(self, data):
  88. """从JSON数据中提取图片和视频URL"""
  89. urls = {
  90. 'main_images': [], # 主图
  91. 'detail_images': [], # 详情图
  92. 'videos': [] # 视频
  93. }
  94. seen_urls = set()
  95. def add_url(url, url_type):
  96. """添加URL,避免重复"""
  97. if url and url not in seen_urls:
  98. seen_urls.add(url)
  99. if url_type == 'main':
  100. urls['main_images'].append(url)
  101. elif url_type == 'detail':
  102. urls['detail_images'].append(url)
  103. elif url_type == 'video':
  104. urls['videos'].append(url)
  105. try:
  106. # 从common_info中提取SKU主图
  107. if 'data' in data and 'ext' in data['data']:
  108. common_info = data['data']['ext'].get('common_info', {})
  109. # 提取商品名称和描述
  110. if 'skus' in common_info:
  111. for sku in common_info['skus']:
  112. if 'picture' in sku and sku['picture']:
  113. add_url(sku['picture'], 'main')
  114. # 从blocks中提取
  115. if 'blocks' in data['data']['ext']:
  116. blocks = data['data']['ext']['blocks']
  117. # 提取主图(头图)
  118. for block in blocks.get('banner', []):
  119. s_type = block.get('s_type', '')
  120. # 头图模块
  121. if s_type == 'sm_type_detail_pop_head_photo':
  122. head_info = block.get('data', {}).get('head_info', {})
  123. pictures = head_info.get('pictures', [])
  124. for pic in pictures:
  125. add_url(pic, 'main')
  126. # 商品详情图模块
  127. elif s_type == 'sm_type_goods_detail_describe_non_food':
  128. json_data = block.get('data', {}).get('json_data', {})
  129. pic_content = json_data.get('pic_content', {})
  130. contents = pic_content.get('contents', [])
  131. for content in contents:
  132. if content.startswith('http') and ('jpg' in content or 'png' in content or 'jpeg' in content):
  133. add_url(content, 'detail')
  134. # 推荐搭配模块中的图片
  135. elif s_type == 'sm_type_goods_detail_match_product':
  136. json_data = block.get('data', {}).get('json_data', {})
  137. collocate = json_data.get('collocate_recommend', {})
  138. spus = collocate.get('spus', [])
  139. for spu in spus:
  140. if 'picture' in spu and spu['picture']:
  141. add_url(spu['picture'], 'detail')
  142. # 从float模块中提取购物车相关图片
  143. for block in blocks.get('float', []):
  144. if block.get('s_type') == 'sm_type_cart_info':
  145. data_json = block.get('data', {})
  146. shopping_cart = data_json.get('shopping_cart', {})
  147. if shopping_cart.get('shopping_cart_icon'):
  148. add_url(shopping_cart['shopping_cart_icon'], 'detail')
  149. # 使用正则表达式提取所有图片URL作为补充
  150. data_str = json.dumps(data, ensure_ascii=False)
  151. # 匹配图片URL
  152. image_pattern = r'https?://[^\s"\']+\.(?:jpg|jpeg|png|gif|webp)(?:\?[^\s"\']*)?'
  153. all_images = re.findall(image_pattern, data_str, re.IGNORECASE)
  154. # 视频匹配
  155. video_pattern = r'https?://[^\s"\']+\.(?:mp4|mov|avi|flv|webm)(?:\?[^\s"\']*)?'
  156. all_videos = re.findall(video_pattern, data_str, re.IGNORECASE)
  157. # 分类正则匹配到的图片
  158. for img_url in all_images:
  159. if img_url in seen_urls:
  160. continue
  161. seen_urls.add(img_url)
  162. # 根据URL特征分类
  163. if 'head_photo' in img_url or 'main' in img_url.lower() or 'sku' in img_url.lower():
  164. urls['main_images'].append(img_url)
  165. else:
  166. urls['detail_images'].append(img_url)
  167. # 添加视频
  168. for video_url in all_videos:
  169. if video_url not in seen_urls:
  170. seen_urls.add(video_url)
  171. urls['videos'].append(video_url)
  172. except Exception as e:
  173. print(f"提取URL时出错: {e}")
  174. # 限制数量
  175. urls['main_images'] = urls['main_images'][:10]
  176. urls['detail_images'] = urls['detail_images'][:30]
  177. urls['videos'] = urls['videos'][:5]
  178. return urls
  179. def extract_product_info(self, data):
  180. """提取商品信息"""
  181. product_info = {
  182. 'name': '未命名商品',
  183. 'description': '无',
  184. 'min_price': '未知',
  185. 'spec': '无',
  186. 'brand': '未知',
  187. 'month_saled': '未知'
  188. }
  189. try:
  190. if 'data' in data and 'ext' in data['data']:
  191. common_info = data['data']['ext'].get('common_info', {})
  192. product_info['name'] = common_info.get('name', '未命名商品')
  193. product_info['description'] = common_info.get('description', '无')
  194. product_info['min_price'] = common_info.get('min_price', '未知')
  195. product_info['spec'] = common_info.get('sku_label', '无')
  196. # 提取月销量
  197. if 'skus' in common_info and common_info['skus']:
  198. sku = common_info['skus'][0]
  199. if 'promotion_info' in sku:
  200. match = re.search(r'月售(\d+)', sku.get('promotion_info', ''))
  201. if match:
  202. product_info['month_saled'] = match.group(1)
  203. # 提取品牌
  204. if 'name' in product_info and '农夫山泉' in product_info['name']:
  205. product_info['brand'] = '农夫山泉'
  206. elif 'name' in product_info and '可口可乐' in product_info['name']:
  207. product_info['brand'] = '可口可乐'
  208. elif 'name' in product_info and '百事' in product_info['name']:
  209. product_info['brand'] = '百事'
  210. except Exception as e:
  211. print(f"提取商品信息时出错: {e}")
  212. return product_info
  213. def download_file(self, url, filepath, timeout=30):
  214. """下载文件"""
  215. try:
  216. response = self.session.get(url, timeout=timeout, stream=True)
  217. if response.status_code == 200:
  218. with open(filepath, 'wb') as f:
  219. for chunk in response.iter_content(chunk_size=8192):
  220. if chunk:
  221. f.write(chunk)
  222. return True
  223. else:
  224. return False
  225. except Exception as e:
  226. print(f"下载失败 {url}: {e}")
  227. return False
  228. def sanitize_filename(self, name):
  229. """清理文件名中的非法字符"""
  230. # 移除非法字符
  231. name = re.sub(r'[<>:"/\\|?*]', '_', name)
  232. # 限制长度
  233. if len(name) > 100:
  234. name = name[:100]
  235. return name.strip()
  236. def download_resources(self, product_name, output_path, urls, progress_callback):
  237. """下载所有资源"""
  238. results = {
  239. 'success': True,
  240. 'downloaded': 0,
  241. 'total': 0,
  242. 'errors': []
  243. }
  244. # 创建子目录
  245. main_img_dir = output_path / "主图"
  246. detail_img_dir = output_path / "详情图"
  247. video_dir = output_path / "视频"
  248. main_img_dir.mkdir(exist_ok=True)
  249. detail_img_dir.mkdir(exist_ok=True)
  250. video_dir.mkdir(exist_ok=True)
  251. total_items = len(urls['main_images']) + len(urls['detail_images']) + len(urls['videos'])
  252. # 下载主图
  253. for i, url in enumerate(urls['main_images'], 1):
  254. results['total'] += 1
  255. # 获取文件扩展名
  256. path = urlparse(url).path
  257. ext = os.path.splitext(path)[1] or '.jpg'
  258. if ext not in ['.jpg', '.jpeg', '.png', '.gif', '.webp']:
  259. ext = '.jpg'
  260. filename = f"zhutu_{i}{ext}"
  261. filepath = main_img_dir / filename
  262. progress_callback(f"下载主图 {i}/{len(urls['main_images'])}...",
  263. int(results['total'] / total_items * 100))
  264. if self.download_file(url, filepath):
  265. results['downloaded'] += 1
  266. else:
  267. results['errors'].append(f"主图{i}: {url}")
  268. # 下载详情图
  269. for i, url in enumerate(urls['detail_images'], 1):
  270. results['total'] += 1
  271. path = urlparse(url).path
  272. ext = os.path.splitext(path)[1] or '.jpg'
  273. if ext not in ['.jpg', '.jpeg', '.png', '.gif', '.webp']:
  274. ext = '.jpg'
  275. filename = f"xiangqing_{i}{ext}"
  276. filepath = detail_img_dir / filename
  277. progress_callback(f"下载详情图 {i}/{len(urls['detail_images'])}...",
  278. int(results['total'] / total_items * 100))
  279. if self.download_file(url, filepath):
  280. results['downloaded'] += 1
  281. else:
  282. results['errors'].append(f"详情图{i}: {url}")
  283. # 下载视频
  284. for i, url in enumerate(urls['videos'], 1):
  285. results['total'] += 1
  286. path = urlparse(url).path
  287. ext = os.path.splitext(path)[1] or '.mp4'
  288. if ext not in ['.mp4', '.mov', '.avi', '.webm']:
  289. ext = '.mp4'
  290. filename = f"shipin_{i}{ext}"
  291. filepath = video_dir / filename
  292. progress_callback(f"下载视频 {i}/{len(urls['videos'])}...",
  293. int(results['total'] / total_items * 100))
  294. if self.download_file(url, filepath):
  295. results['downloaded'] += 1
  296. else:
  297. results['errors'].append(f"视频{i}: {url}")
  298. return results
  299. def start_download(self):
  300. """启动下载任务"""
  301. # 获取输入数据
  302. input_data = self.input_text.get(1.0, END).strip()
  303. if not input_data:
  304. messagebox.showerror("错误", "请输入商品数据")
  305. return
  306. # 检查输出目录
  307. output_dir = self.output_dir.get()
  308. if not output_dir:
  309. messagebox.showerror("错误", "请选择输出目录")
  310. return
  311. # 禁用启动按钮
  312. self.start_btn.config(state=DISABLED, text="⏳ 下载中...")
  313. self.progress_var.set(0)
  314. # 在新线程中执行下载
  315. thread = threading.Thread(target=self.download_task, args=(input_data, output_dir))
  316. thread.daemon = True
  317. thread.start()
  318. def download_task(self, input_data, output_dir):
  319. """下载任务"""
  320. try:
  321. # 解析JSON
  322. data = json.loads(input_data)
  323. # 提取商品信息
  324. product_info = self.extract_product_info(data)
  325. # 更新信息显示
  326. self.root.after(0, self.update_info_display, product_info)
  327. # 清理产品名作为文件夹名
  328. safe_name = self.sanitize_filename(product_info['name'])
  329. # 创建产品文件夹
  330. product_path = Path(output_dir) / safe_name
  331. product_path.mkdir(exist_ok=True)
  332. # 提取URL
  333. self.update_status_thread("正在分析数据,提取资源链接...")
  334. urls = self.extract_urls(data)
  335. total_urls = len(urls['main_images']) + len(urls['detail_images']) + len(urls['videos'])
  336. if total_urls == 0:
  337. self.root.after(0, messagebox.showwarning, "警告", "未找到任何图片或视频资源")
  338. self.update_status_thread("未找到资源")
  339. self.enable_button_thread()
  340. return
  341. self.update_status_thread(f"发现 {len(urls['main_images'])} 张主图, {len(urls['detail_images'])} 张详情图, {len(urls['videos'])} 个视频")
  342. # 定义进度回调
  343. def update_progress(msg, progress):
  344. self.update_status_thread(msg)
  345. self.root.after(0, self.progress_var.set, progress)
  346. # 下载资源
  347. results = self.download_resources(safe_name, product_path, urls, update_progress)
  348. # 更新进度到100%
  349. self.root.after(0, self.progress_var.set, 100)
  350. # 显示结果
  351. if results['downloaded'] > 0:
  352. msg = f"✅ 下载完成!\n\n成功下载 {results['downloaded']}/{results['total']} 个文件\n保存路径: {product_path}"
  353. if results['errors']:
  354. msg += f"\n\n⚠️ 失败 {len(results['errors'])} 个文件"
  355. self.root.after(0, messagebox.showinfo, "完成", msg)
  356. self.update_status_thread(f"下载完成 - 成功 {results['downloaded']}/{results['total']}")
  357. else:
  358. self.root.after(0, messagebox.showerror, "错误", "下载失败,没有成功下载任何文件")
  359. self.update_status_thread("下载失败")
  360. except json.JSONDecodeError as e:
  361. self.root.after(0, messagebox.showerror, "JSON解析错误", f"输入的不是有效的JSON格式:\n{str(e)}")
  362. self.update_status_thread("JSON解析失败")
  363. except Exception as e:
  364. self.root.after(0, messagebox.showerror, "错误", f"处理过程中出现错误:\n{str(e)}")
  365. self.update_status_thread("处理失败")
  366. finally:
  367. self.enable_button_thread()
  368. def update_status_thread(self, message):
  369. """从线程更新状态"""
  370. self.root.after(0, lambda: self.status_label.config(text=message))
  371. def enable_button_thread(self):
  372. """从线程启用按钮"""
  373. self.root.after(0, lambda: self.start_btn.config(state=NORMAL, text="▶ 启动下载"))
  374. def main():
  375. root = Tk()
  376. app = ProductDownloaderGUI(root)
  377. root.mainloop()
  378. if __name__ == "__main__":
  379. main()