  • 输入任意文章页面的URL,返回标题/作者/正文/发布时间/图片/面包屑等一系列信息
  • 支持异步加载文章页提取
  • 支持上传自定义的HTML代码提取正文
  • 支持自动检测网页编码
  • 支持自动提取网页全部URL
  • 在8个国家13万个新闻类网站进行测试,准确率高达90%


import requests
import json

# Endpoint used by this example to extract an article from a URL.
url = "https://crawler.kingname.info/gne/crawl"

# Request parameters; sent as the JSON body of the POST below.
body = {
    "url": "https://www.kingname.info/2023/10/17/rubbish/",  # article page to extract
    "js": False,  # switch to True when the page content is loaded asynchronously
    "charset": "auto",  # let the service detect the page encoding
}

# The API token travels in the `token` request header.
headers = {
    'token': '<TOKEN>',
}

response = requests.post(url, headers=headers, json=body)



{'title': '拒绝成为这样的程序员',
'publish_time_ts': 1697545236,
'publish_time': '2023-10-17 12:20:36',
'summary': '产品经理这两天在跟我抱怨他们公司的一个码农。听的我火冒三丈,差点把跟了我十多年的搪瓷水杯砸烂。. 正好在知识星球和微信群里面,有不少同学跟我咨询程序员的职业发展以及怎么应对三十岁危机。',
'canonical_url': 'https://www.kingname.info/2023/10/17/rubbish/',
'image_count': 0,
'image_urls': [],
'image_captions': [],
'author': [],
'timezone': 'UTC8',
'url': 'https://www.kingname.info/2023/10/17/rubbish/',
'amp_url': '',
}



{
"success": false,
"msg": "解析正文失败!"
}

那么,只需要把参数中的"js": False改为"js": True即可.例如:

 import requests
import json

# Same extraction endpoint as the basic example; only the `js` flag differs.
url = "https://crawler.kingname.info/gne/crawl"

# Request parameters; sent as the JSON body of the POST below.
body = {
    "url": "https://mp.weixin.qq.com/s/VObN8Ve8piv_I13fKbWhww",  # article page to extract
    "js": True,  # enabled because plain extraction fails on asynchronously loaded pages
    "charset": "auto",  # let the service detect the page encoding
}

# The API token travels in the `token` request header.
headers = {
    'token': '<TOKEN>',
}

response = requests.post(url, headers=headers, json=body)




 import requests
import json

# Same endpoint as the article examples; `target` selects link extraction.
url = "https://crawler.kingname.info/gne/crawl"

# Request parameters; sent as the JSON body of the POST below.
body = {
    "url": "https://www.kingname.info/archives/",  # page whose links are wanted
    "js": False,  # Python boolean literal; lowercase `false` would raise NameError
    "charset": "auto",  # let the service detect the page encoding
    "target": "link",  # ask for the page's URLs (url/anchor pairs) instead of the article
}

# The API token travels in the `token` request header.
headers = {
    'token': '<TOKEN>',
}

response = requests.post(url, headers=headers, json=body)



[{'url': 'https://www.kingname.info/', 'anchor': '谢乾坤 | Kingname'},
{'url': 'https://www.kingname.info/archives/', 'anchor': '归档'},
{'url': 'https://www.kingname.info/tags', 'anchor': '标签'},
{'url': 'https://www.kingname.info/mp', 'anchor': '公众号'},
{'url': 'https://www.kingname.info/about', 'anchor': '关于'},
{'url': 'https://github.com/kingname', 'anchor': 'GitHub'},
{'url': 'https://www.kingname.info/2023/11/29/jwt/',
'anchor': '一日一技:分布式系统的低成本权限校验机制'},
{'url': 'https://www.kingname.info/2023/11/15/git-worktree/',
'anchor': '一日一技:如何同时在多个分支写代码?'},
{'url': 'https://www.kingname.info/2023/11/14/typeddict/',
'anchor': '一日一技:警告但不禁止,遗留代码的优化策略'},
{'url': 'https://www.kingname.info/2023/11/11/gpts/',
'anchor': '老板让我加班怎么办?GPTs创建机器人实战'},
{'url': 'https://www.kingname.info/2023/11/11/python-run-other-code/',
'anchor': '一日一技:如何安全运行别人上传的Python代码?'},
{'url': 'https://www.kingname.info/2023/10/28/parse-json-object/',
'anchor': '一日一技:爬虫如何解析JavaScript Object?'},
{'url': 'https://www.kingname.info/2023/10/28/json-in-html/',
'anchor': '一日一技:HTML里面提取的JSON怎么解析不了?'},
{'url': 'https://www.kingname.info/2023/10/17/curl-cffi/',
'anchor': '一日一技:Requests被网站识别怎么办?'},
{'url': 'https://www.kingname.info/2023/10/17/rubbish/',
'anchor': '拒绝成为这样的程序员'},
{'url': 'https://www.kingname.info/2023/09/22/json-to-obj/',
'anchor': '一日一技:JSON如何快速转成对象?'},
{'url': 'https://www.kingname.info/archives/page/2/', 'anchor': '2'},
{'url': 'https://www.kingname.info/archives/page/23/', 'anchor': '23'},
{'url': 'https://hexo.io/', 'anchor': 'Hexo'},
]



 # Upload locally saved HTML for extraction via the crawl_html endpoint.
import requests
import time

# Read the saved page as UTF-8 so the text matches the 'charset' declared
# in the payload, regardless of the platform's default encoding.
with open('/Users/kingname/Downloads/okx.html', encoding='utf-8') as f:
    html = f.read()

# Request parameters; sent as the JSON body of the POST below.
body = {
    'html': html,  # raw HTML source to extract from
    'url': 'https://www.okx.com/learn/curve-finance-guide',  # URL the HTML was saved from
    'fetch_time': int(time.time()),  # current Unix timestamp, in seconds
    'charset': 'utf-8',
}

# `json=body` is required: without it the payload built above is never sent.
resp = requests.post(
    'https://crawler.kingname.info/gne/crawl_html',
    headers={'token': '<TOKEN>'},
    json=body,
)




'summary': 'Ethereum’s network is the home to a wide range of unique projects. Ever since it created the ERC-20 token model, thousands of tokens have been launched. In time, new products have emerged as well, such as dApps, decentralized finance (DeFi) protocols and decentralized exchanges (DEXes). One example of a...',
'canonical_url': 'https://www.okx.com/learn/curve-finance-guide',
'image_count': 2,
'image_urls': ['https://static.okx.com/cdn/assets/plugins/contentful/4nqoo8goeymu/77FafUZE1sjddle2cvbVwT/a02397f818dac7686303ee8da38afad0/Crv.jpg', ...],
'image_captions': ['', ''],
'author': ['OKX'],
'timezone': '',
'url': 'https://www.okx.com/learn/curve-finance-guide',
'amp_url': '',
'breadcrumb': [{'level': 0,
'text': 'Learn',
'link': 'https://www.okx.com/learn'},
{'level': 1,
'text': 'Glossary',
'link': 'https://www.okx.com/learn/category/blockchain-glossary'},
{'level': 2, 'text': 'Article', 'link': ''}]}



 <!DOCTYPE html>
<html lang="en">
<meta charset="UTF-8">




  • title: 新闻标题
  • publish_time_ts: 发布时间的时间戳,精确到秒
  • publish_time: 发布时间,格式为YYYY-mm-dd HH:MM:SS
  • content: 纯文本形式的正文(段落会被压缩)
  • clean_content: 精简以后的正文HTML
  • content_list: 按段落和图片划分的正文列表.使用这个字段你可以很方便地把正文和图片的相对位置区分开
  • summary: 正文的前300个字符
  • image_count: 正文中图片的数量
  • image_urls: 正文中图片的url列表
  • image_captions: 正文图片的标题列表
  • head_meta: html中的元信息
  • author: 作者
  • url: 页面的url.如果页面经过多次跳转,这个url为最终url
  • breadcrumb: 面包屑



