浏览代码

旧站文章爬取

master
leiyun 3 周前
父节点
当前提交
a84ca2ea57
共有 7 个文件被更改,包括 703 次插入0 次删除
  1. +2
    -0
      package.json
  2. +221
    -0
      pnpm-lock.yaml
  3. +21
    -0
      sql/update_20260224_spider_news.sql
  4. +8
    -0
      src/config/router.js
  5. +439
    -0
      src/controller/admin/spider.js
  6. +7
    -0
      src/controller/base.js
  7. +5
    -0
      src/model/spider_news.js

+ 2
- 0
package.json 查看文件

@@ -10,6 +10,8 @@
"lint-fix": "eslint --fix src/"
},
"dependencies": {
"axios": "^1.13.5",
"cheerio": "^1.2.0",
"cos-nodejs-sdk-v5": "^2.14.0",
"jsonwebtoken": "^9.0.3",
"sharp": "^0.33.0",


+ 221
- 0
pnpm-lock.yaml 查看文件

@@ -8,6 +8,12 @@ importers:

.:
dependencies:
axios:
specifier: ^1.13.5
version: 1.13.5
cheerio:
specifier: ^1.2.0
version: 1.2.0
cos-nodejs-sdk-v5:
specifier: ^2.14.0
version: 2.15.4
@@ -471,6 +477,9 @@ packages:
aws4@1.13.2:
resolution: {integrity: sha512-lHe62zvbTB5eEABUVi/AwVh0ZKY9rMMDhmm+eeyuuUQbQ3+J+fONVQOZyj+DdrvD4BY33uYniyRJ4UJIaSKAfw==}

axios@1.13.5:
resolution: {integrity: sha512-cz4ur7Vb0xS4/KUN0tPWe44eqxrIu31me+fbang3ijiNscE129POzipJJA6zniq2C/Z6sJCjMimjS8Lc/GAs8Q==}

babel-code-frame@6.26.0:
resolution: {integrity: sha512-XqYMR2dfdGMW+hd0IUZ2PwK+fGeFkOxZJ0wY+JaQAHzt1Zx8LcvpiZD2NiGkEG8qx0CfkAOr5xt76d1e8vG90g==}

@@ -608,6 +617,9 @@ packages:
bluebird@3.7.2:
resolution: {integrity: sha512-XpNj6GDQzdfW+r2Wnn7xiSAd7TM3jzkxGXBGTtWKuSXv1xUV+azxAm8jdWZN06QTQk+2N2XB9jRDkvbmQmcRtg==}

boolbase@1.0.0:
resolution: {integrity: sha512-JZOSA7Mo9sNGB8+UjSgzdLtokWAky1zbztM3WRLCbZ70/3cTANmQmOdR7y2g+J0e2WXywy1yS468tY+IruqEww==}

boxen@0.6.0:
resolution: {integrity: sha512-yL8sYzt0avlYGOY6LqtECkGrJOY3cCLAbFPaNfgE+4fD45ZrdYqLdY8yF4bqyTkpfW9e6W0YqBkN7dIn/PrZoA==}
engines: {node: '>=0.10.0'}
@@ -706,6 +718,13 @@ packages:
chardet@0.4.2:
resolution: {integrity: sha512-j/Toj7f1z98Hh2cYo2BVr85EpIRWqUi7rtRSGxh/cqUjqrnJe9l9UE7IUGd2vQ2p+kSHLkSzObQPZPLUC6TQwg==}

cheerio-select@2.1.0:
resolution: {integrity: sha512-9v9kG0LvzrlcungtnJtpGNxY+fzECQKhK4EGJX2vByejiMX84MFNQw4UxPJl3bFbTMw+Dfs37XaIkCwTZfLh4g==}

cheerio@1.2.0:
resolution: {integrity: sha512-WDrybc/gKFpTYQutKIK6UvfcuxijIZfMfXaYm8NMsPQxSYvf+13fXUJ4rztGGbJcBQ/GF55gvrZ0Bc0bj/mqvg==}
engines: {node: '>=20.18.1'}

chokidar@1.7.0:
resolution: {integrity: sha512-mk8fAWcRUOxY7btlLtitj3A45jOwSAxH4tOFOoEGbVsl6cL6pPMWUy7dwZ/canfj3QEdP6FHSnf/l1c6/WkzVg==}

@@ -879,6 +898,13 @@ packages:
cross-spawn@5.1.0:
resolution: {integrity: sha512-pTgQJ5KC0d2hcY8eyL1IzlBPYjTkyH72XRZPnLyKus2mBfNjQs3klqbJU2VILqZryAZUt9JOb3h/mWMy23/f5A==}

css-select@5.2.2:
resolution: {integrity: sha512-TizTzUddG/xYLA3NXodFM0fSbNizXjOKhqiQQwvhlspadZokn1KDy0NZFS0wuEubIYAV5/c1/lAr0TaaFXEXzw==}

css-what@6.2.2:
resolution: {integrity: sha512-u/O3vwbptzhMs3L1fQE82ZSLHQQfto5gyZzwteVIEyeaY5Fc7R4dapF/BvRoSYFeqfBk4m0V1Vafq5Pjv25wvA==}
engines: {node: '>= 6'}

currently-unhandled@0.4.1:
resolution: {integrity: sha512-/fITjgjGU50vjQ4FH6eUoYu+iUoUKIXws2hL15JJpIR+BbTxaXQsMuuyjtNh2WqsSBS5nsaZHFsFecyw5CCAng==}
engines: {node: '>=0.10.0'}
@@ -1017,6 +1043,19 @@ packages:
resolution: {integrity: sha512-35mSku4ZXK0vfCuHEDAwt55dg2jNajHZ1odvF+8SSr82EsZY4QmXfuWso8oEd8zRhVObSN18aM0CjSdoBX7zIw==}
engines: {node: '>=0.10.0'}

dom-serializer@2.0.0:
resolution: {integrity: sha512-wIkAryiqt/nV5EQKqQpo3SToSOV9J0DnbJqwK7Wv/Trc92zIAYZ4FlMu+JPFW1DfGFt81ZTCGgDEabffXeLyJg==}

domelementtype@2.3.0:
resolution: {integrity: sha512-OLETBj6w0OsagBwdXnPdN0cnMfF9opN69co+7ZrbfPGrdpPVNBUj02spi6B1N7wChLQiPn4CSH/zJvXw56gmHw==}

domhandler@5.0.3:
resolution: {integrity: sha512-cgwlv/1iFQiFnU96XXgROh8xTeetsnJiDsTc7TYCLFd9+/WNkIqPTxiM/8pSd8VIrhXGTf1Ny1q1hquVqDJB5w==}
engines: {node: '>= 4'}

domutils@3.2.2:
resolution: {integrity: sha512-6kZKyUajlDuqlHKVX1w7gyslj9MPIXzIFiz/rGu35uC1wMi+kMhQwGhl4lt9unC9Vb9INnY9Z3/ZA3+FhASLaw==}

dot-prop@3.0.0:
resolution: {integrity: sha512-k4ELWeEU3uCcwub7+dWydqQBRjAjkV9L33HjVRG5Xo2QybI6ja/v+4W73SRi8ubCqJz0l9XsTP1NbewfyqaSlw==}
engines: {node: '>=0.10.0'}
@@ -1055,6 +1094,21 @@ packages:
resolution: {integrity: sha512-TPJXq8JqFaVYm2CWmPvnP2Iyo4ZSM7/QKcSmuMLDObfpH5fi7RUGmd/rTDf+rut/saiDiQEeVTNgAmJEdAOx0w==}
engines: {node: '>= 0.8'}

encoding-sniffer@0.2.1:
resolution: {integrity: sha512-5gvq20T6vfpekVtqrYQsSCFZ1wEg5+wW0/QaZMWkFr6BqD3NfKs0rLCx4rrVlSWJeZb5NBJgVLswK/w2MWU+Gw==}

entities@4.5.0:
resolution: {integrity: sha512-V0hjH4dGPh9Ao5p0MoRY6BVqtwCjhz6vI5LT8AJ55H+4g9/4vbHx1I54fS0XuclLhDHArPQCiMjDxjaL8fPxhw==}
engines: {node: '>=0.12'}

entities@6.0.1:
resolution: {integrity: sha512-aN97NXWF6AWBTahfVOIrB/NShkzi5H7F9r1s9mD3cDj4Ko5f2qhhVoYMibXF7GlLveb/D2ioWay8lxI97Ven3g==}
engines: {node: '>=0.12'}

entities@7.0.1:
resolution: {integrity: sha512-TWrgLOFUQTH994YUyl1yT4uyavY5nNB5muff+RtWaqNVCAK408b5ZnnbNAUEWLTCpum9w6arT70i1XdQ4UeOPA==}
engines: {node: '>=0.12'}

env-paths@2.2.1:
resolution: {integrity: sha512-+h1lkLKhZMTYjog1VEpJNG7NZJWcuc2DDk/qsqSTRRCOXiLjeQ1d1/udrUGhqMxUgAlwKNZ0cf2uqan5GLuS2A==}
engines: {node: '>=6'}
@@ -1323,6 +1377,15 @@ packages:
resolution: {integrity: sha512-oIDB1rXf3BUnn00bh2jVM0byuqr94rBh6g7ZfdKcbmp1we2GQtPzKdloyvBXHs+q3fvxB8EqX5ecFba3RwCSjA==}
engines: {node: '>=0.10.0'}

follow-redirects@1.15.11:
resolution: {integrity: sha512-deG2P0JfjrTxl50XGCDyfI97ZGVCxIpfKYmfyrQ54n5FO/0gfIES8C/Psl6kWVDolizcaaxZJnTS0QSMxvnsBQ==}
engines: {node: '>=4.0'}
peerDependencies:
debug: '*'
peerDependenciesMeta:
debug:
optional: true

for-each@0.3.5:
resolution: {integrity: sha512-dKx12eRCVIzqCxFGplyFKJMPvLEWgmNtUrpTiJIR5u97zEhRG8ySrtboPHZXx7daLxQVrl643cTzbab2tkQjxg==}
engines: {node: '>= 0.4'}
@@ -1342,6 +1405,10 @@ packages:
resolution: {integrity: sha512-1lLKB2Mu3aGP1Q/2eCOx0fNbRMe7XdwktwOruhfqqd0rIJWwN4Dh+E3hrPSlDCXnSR7UtZ1N38rVXm+6+MEhJQ==}
engines: {node: '>= 0.12'}

form-data@4.0.5:
resolution: {integrity: sha512-8RipRLol37bNs2bhoV67fiTEvdTrbMUYcFTiy3+wuuOnUog2QBHCZWXDRijWQfAkhBj2Uf5UnVaiWwA5vdd82w==}
engines: {node: '>= 6'}

formidable@1.2.6:
resolution: {integrity: sha512-KcpbcpuLNOwrEjnbpMC0gS+X8ciDoZE1kkqzat4a8vrprf+s9pKNQ/QIwWfbfs4ltgmFl3MD177SNTkve3BwGQ==}
deprecated: 'Please upgrade to latest, formidable@v2 or formidable@v3! Check these notes: https://bit.ly/2ZEqIau'
@@ -1528,6 +1595,9 @@ packages:
hosted-git-info@2.8.9:
resolution: {integrity: sha512-mxIDAb9Lsm6DoOJ7xH+5+X4y1LU/4Hi50L9C5sIswK3JzULS4bwk1FvjdBgvYR4bzT4tuUQiC15FE2f5HbLvYw==}

htmlparser2@10.1.0:
resolution: {integrity: sha512-VTZkM9GWRAtEpveh7MSF6SjjrpNVNNVJfFup7xTY3UpFtm67foy9HDVXneLtFVt4pMz5kZtgNcvCniNFb1hlEQ==}

http-assert@1.5.0:
resolution: {integrity: sha512-uPpH7OKX4H25hBmU6G1jWNaqJGpTXxey+YOUizJUAgu0AjLUeC8D73hTrhvDS5D+GJN1DN1+hhc/eF/wpxtp0w==}
engines: {node: '>= 0.8'}
@@ -1552,6 +1622,10 @@ packages:
resolution: {integrity: sha512-v3MXnZAcvnywkTUEZomIActle7RXXeedOR31wwl7VlyoXO4Qi9arvSenNQWne1TcRwhCL1HwLI21bEqdpj8/rA==}
engines: {node: '>=0.10.0'}

iconv-lite@0.6.3:
resolution: {integrity: sha512-4fCk79wshMdzMp2rH06qWrJE4iolqLhCUH+OiuIgU++RB0+94NlDL81atO7GX55uUKueo0txHNtvEyI6D7WdMw==}
engines: {node: '>=0.10.0'}

ignore-by-default@1.0.1:
resolution: {integrity: sha512-Ius2VYcGNk7T90CppJqcIkS5ooHUZyIQK+ClZfMfMNFEF9VSE73Fq+906u/CWu92x4gzZMWOwfFYckPObzdEbA==}

@@ -2269,6 +2343,9 @@ packages:
resolution: {integrity: sha512-lJxZYlT4DW/bRUtFh1MQIWqmLwQfAxnqWG4HhEdjMlkrJYnJn0Jrr2u3mgxqaWsdiBc76TYkTG/mhrnYTuzfHw==}
engines: {node: '>=4'}

nth-check@2.1.1:
resolution: {integrity: sha512-lqjrjmaOoAnWfMmBPL+XNnynZh2+swxiX3WUE0s4yEHI6m+AwrK2UZOimIRl3X/4QctVqS8AiZjFqyOGrMXb/w==}

number-is-nan@1.0.1:
resolution: {integrity: sha512-4jbtZXNAsfZbAHiiqjLPBiCl16dES1zI4Hpzzxw61Tk+loF+sBDBKx1ICKKKwIqQ7M0mFn1TmkN7euSncWgHiQ==}
engines: {node: '>=0.10.0'}
@@ -2465,6 +2542,15 @@ packages:
resolution: {integrity: sha512-LpH1Cf5EYuVjkBvCDBYvkUPh+iv2bk3FHflxHkpCYT0/FZ1d3N3uJaLiHr4yGuMcFUhv6eAivitTvWZI4B/chg==}
engines: {node: '>=0.10.0'}

parse5-htmlparser2-tree-adapter@7.1.0:
resolution: {integrity: sha512-ruw5xyKs6lrpo9x9rCZqZZnIUntICjQAd0Wsmp396Ul9lN/h+ifgVV1x1gZHi8euej6wTfpqX8j+BFQxF0NS/g==}

parse5-parser-stream@7.1.2:
resolution: {integrity: sha512-JyeQc9iwFLn5TbvvqACIF/VXG6abODeB3Fwmv/TGdLk2LfbWkaySGY72at4+Ty7EkPZj854u4CrICqNk2qIbow==}

parse5@7.3.0:
resolution: {integrity: sha512-IInvU7fabl34qmi9gY8XOVxhYyMyuH2xUNpb2q8/Y+7552KlejkRvqvD19nMoUW/uQGGbqNpA6Tufu5FL5BZgw==}

parseurl@1.3.3:
resolution: {integrity: sha512-CiyeOxFT/JZyN5m0z9PfXw4SCBJ6Sygz1Dpl0wqjlhDEGGBP1GnsUVEL0p63hoG1fcj3fHynXi9NYO4nWOL+qQ==}
engines: {node: '>= 0.8'}
@@ -2603,6 +2689,9 @@ packages:
resolution: {integrity: sha512-7PiHtLll5LdnKIMw100I+8xJXR5gW2QwWYkT6iJva0bXitZKa/XMrSbdmg3r2Xnaidz9Qumd0VPaMrZlF9V9sA==}
engines: {node: '>=0.4.0'}

proxy-from-env@1.1.0:
resolution: {integrity: sha512-D+zkORCbA9f1tdWRK0RaCR3GPv50cMxcrz4X8k5LTSUD1Dkw47mKJEZQNunItRTkWwgtaUSo1RVFRIG9ZXiFYg==}

pseudomap@1.0.2:
resolution: {integrity: sha512-b/YwNhb8lk1Zz2+bXXpS/LK9OisiZZ1SNsSLxN1x2OXVEhW2Ckr/7mWE5vrC1ZTiJlD9g19jWszTmJsB+oEpFQ==}

@@ -3332,6 +3421,10 @@ packages:
undici-types@7.16.0:
resolution: {integrity: sha512-Zz+aZWSj8LE6zoxD+xrjh4VfkIG8Ya6LvYkZqtUQGJPZjYl53ypCaUwWqo7eI0x66KBGeRo+mlBEkMSeSZ38Nw==}

undici@7.22.0:
resolution: {integrity: sha512-RqslV2Us5BrllB+JeiZnK4peryVTndy9Dnqq62S3yYRRTj0tFQCwEniUy2167skdGOy3vqRzEvl1Dm4sV2ReDg==}
engines: {node: '>=20.18.1'}

union-value@1.0.1:
resolution: {integrity: sha512-tJfXmxMeWYnczCVs7XAEvIV7ieppALdyepWMkHkwciRpZraG/xwT+s2JN8+pr1+8jCRf80FFzvr+MpQeeoF4Xg==}
engines: {node: '>=0.10.0'}
@@ -3406,6 +3499,14 @@ packages:
resolution: {integrity: sha512-ZZKSmDAEFOijERBLkmYfJ+vmk3w+7hOLYDNkRCuRuMJGEmqYNCNLyBBFwWKVMhfwaEF3WOd0Zlw86U/WC/+nYw==}
engines: {'0': node >=0.6.0}

whatwg-encoding@3.1.1:
resolution: {integrity: sha512-6qN4hJdMwfYBtE3YBTTHhoeuUrDBPZmbQaxWAqSALV/MeEnR5z1xd8UKud2RAkFoPkmB+hli1TZSnyi84xz1vQ==}
engines: {node: '>=18'}

whatwg-mimetype@4.0.0:
resolution: {integrity: sha512-QaKxh0eNIi2mE9p2vEdzfagOKHCcj1pJ56EEHGQOVxp8r9/iszLUUV7v89x9O1p/T+NlTM5W7jW6+cz4Fq1YVg==}
engines: {node: '>=18'}

which-boxed-primitive@1.1.1:
resolution: {integrity: sha512-TbX3mj8n0odCBFVlY8AxkqcHASw3L60jIuF8jFP78az3C2YhmGvqbHBpAjTRH2/xqYunrJ9g1jSyjCjpoWzIAA==}
engines: {node: '>= 0.4'}
@@ -3934,6 +4035,14 @@ snapshots:

aws4@1.13.2: {}

axios@1.13.5:
dependencies:
follow-redirects: 1.15.11
form-data: 4.0.5
proxy-from-env: 1.1.0
transitivePeerDependencies:
- debug

babel-code-frame@6.26.0:
dependencies:
chalk: 1.1.3
@@ -4232,6 +4341,8 @@ snapshots:

bluebird@3.7.2: {}

boolbase@1.0.0: {}

boxen@0.6.0:
dependencies:
ansi-align: 1.1.0
@@ -4366,6 +4477,29 @@ snapshots:

chardet@0.4.2: {}

cheerio-select@2.1.0:
dependencies:
boolbase: 1.0.0
css-select: 5.2.2
css-what: 6.2.2
domelementtype: 2.3.0
domhandler: 5.0.3
domutils: 3.2.2

cheerio@1.2.0:
dependencies:
cheerio-select: 2.1.0
dom-serializer: 2.0.0
domhandler: 5.0.3
domutils: 3.2.2
encoding-sniffer: 0.2.1
htmlparser2: 10.1.0
parse5: 7.3.0
parse5-htmlparser2-tree-adapter: 7.1.0
parse5-parser-stream: 7.1.2
undici: 7.22.0
whatwg-mimetype: 4.0.0

chokidar@1.7.0:
dependencies:
anymatch: 1.3.2
@@ -4554,6 +4688,16 @@ snapshots:
shebang-command: 1.2.0
which: 1.3.1

css-select@5.2.2:
dependencies:
boolbase: 1.0.0
css-what: 6.2.2
domhandler: 5.0.3
domutils: 3.2.2
nth-check: 2.1.1

css-what@6.2.2: {}

currently-unhandled@0.4.1:
dependencies:
array-find-index: 1.0.2
@@ -4668,6 +4812,24 @@ snapshots:
dependencies:
esutils: 2.0.3

dom-serializer@2.0.0:
dependencies:
domelementtype: 2.3.0
domhandler: 5.0.3
entities: 4.5.0

domelementtype@2.3.0: {}

domhandler@5.0.3:
dependencies:
domelementtype: 2.3.0

domutils@3.2.2:
dependencies:
dom-serializer: 2.0.0
domelementtype: 2.3.0
domhandler: 5.0.3

dot-prop@3.0.0:
dependencies:
is-obj: 1.0.1
@@ -4710,6 +4872,17 @@ snapshots:

encodeurl@1.0.2: {}

encoding-sniffer@0.2.1:
dependencies:
iconv-lite: 0.6.3
whatwg-encoding: 3.1.1

entities@4.5.0: {}

entities@6.0.1: {}

entities@7.0.1: {}

env-paths@2.2.1: {}

equal-length@1.0.1: {}
@@ -5101,6 +5274,8 @@ snapshots:

fn-name@2.0.1: {}

follow-redirects@1.15.11: {}

for-each@0.3.5:
dependencies:
is-callable: 1.2.7
@@ -5119,6 +5294,14 @@ snapshots:
combined-stream: 1.0.8
mime-types: 2.1.35

form-data@4.0.5:
dependencies:
asynckit: 0.4.0
combined-stream: 1.0.8
es-set-tostringtag: 2.1.0
hasown: 2.0.2
mime-types: 2.1.35

formidable@1.2.6: {}

fragment-cache@0.2.1:
@@ -5320,6 +5503,13 @@ snapshots:

hosted-git-info@2.8.9: {}

htmlparser2@10.1.0:
dependencies:
domelementtype: 2.3.0
domhandler: 5.0.3
domutils: 3.2.2
entities: 7.0.1

http-assert@1.5.0:
dependencies:
deep-equal: 1.0.1
@@ -5358,6 +5548,10 @@ snapshots:
dependencies:
safer-buffer: 2.1.2

iconv-lite@0.6.3:
dependencies:
safer-buffer: 2.1.2

ignore-by-default@1.0.1: {}

ignore@3.3.10: {}
@@ -6130,6 +6324,10 @@ snapshots:
dependencies:
path-key: 2.0.1

nth-check@2.1.1:
dependencies:
boolbase: 1.0.0

number-is-nan@1.0.1: {}

nunjucks@3.2.4:
@@ -6305,6 +6503,19 @@ snapshots:

parse-ms@1.0.1: {}

parse5-htmlparser2-tree-adapter@7.1.0:
dependencies:
domhandler: 5.0.3
parse5: 7.3.0

parse5-parser-stream@7.1.2:
dependencies:
parse5: 7.3.0

parse5@7.3.0:
dependencies:
entities: 6.0.1

parseurl@1.3.3: {}

pascalcase@0.1.1: {}
@@ -6408,6 +6619,8 @@ snapshots:

progress@2.0.3: {}

proxy-from-env@1.1.0: {}

pseudomap@1.0.2: {}

psl@1.15.0:
@@ -7368,6 +7581,8 @@ snapshots:

undici-types@7.16.0: {}

undici@7.22.0: {}

union-value@1.0.1:
dependencies:
arr-union: 3.1.0
@@ -7438,6 +7653,12 @@ snapshots:
core-util-is: 1.0.2
extsprintf: 1.3.0

whatwg-encoding@3.1.1:
dependencies:
iconv-lite: 0.6.3

whatwg-mimetype@4.0.0: {}

which-boxed-primitive@1.1.1:
dependencies:
is-bigint: 1.1.0


+ 21
- 0
sql/update_20260224_spider_news.sql 查看文件

@@ -0,0 +1,21 @@
-- Spider staging table for news data crawled from the origin site.
-- Each row tracks one origin article; the `status` column records how far the
-- row has progressed (its allowed values are listed in the column COMMENT).
CREATE TABLE IF NOT EXISTS `pap_spider_news` (
`id` int(11) NOT NULL AUTO_INCREMENT,
-- Article ID on the origin site; unique per row (see uk_origin_id below).
`origin_id` int(11) NOT NULL DEFAULT 0 COMMENT '原站文章ID',
`title` varchar(500) NOT NULL DEFAULT '' COMMENT '文章标题',
`category` varchar(100) DEFAULT '' COMMENT '原站栏目分类',
`publish_time` datetime DEFAULT NULL COMMENT '原站发布时间',
-- Both the processed and the raw body are stored side by side.
`content` longtext COMMENT '处理后的正文(图片已转存)',
`content_original` longtext COMMENT '原始正文HTML(未处理图片)',
-- Both the transferred and the original cover URL are stored side by side.
`cover` varchar(500) DEFAULT '' COMMENT '封面图(转存后)',
`cover_original` varchar(500) DEFAULT '' COMMENT '原始封面图URL',
-- JSON text; the spider controller writes an array of {original, new} entries.
`images` text COMMENT '图片映射JSON [{original, new}]',
`status` tinyint(1) DEFAULT 0 COMMENT '0待爬取 1已爬取详情 2图片已转存 3已同步到article -1失败',
`article_id` int(11) DEFAULT 0 COMMENT '同步到article表后的ID',
-- Limited to 1000 characters; writers must keep messages within that length.
`error_msg` varchar(1000) DEFAULT '' COMMENT '错误信息',
`create_time` datetime DEFAULT CURRENT_TIMESTAMP,
`update_time` datetime DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
PRIMARY KEY (`id`),
-- Guarantees an origin article is staged at most once.
UNIQUE KEY `uk_origin_id` (`origin_id`),
-- Index on status; the spider controller filters rows by status.
KEY `idx_status` (`status`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='爬虫-新闻数据';

+ 8
- 0
src/config/router.js 查看文件

@@ -108,6 +108,14 @@ module.exports = [
['/admin/content/person/edit', 'admin/content/person/edit', 'post'],
['/admin/content/person/delete', 'admin/content/person/delete', 'post'],

// 爬虫工具
['/admin/spider/fetch-list', 'admin/spider/fetchList'],
['/admin/spider/fetch-detail', 'admin/spider/fetchDetail'],
['/admin/spider/transfer-images', 'admin/spider/transferImages'],
['/admin/spider/sync', 'admin/spider/sync'],
['/admin/spider/status', 'admin/spider/status'],
['/admin/spider/retry', 'admin/spider/retry'],

// 内容管理 - 捐赠收支
['/admin/content/donation', 'admin/content/donation/index'],
['/admin/content/donation/stat', 'admin/content/donation/stat'],


+ 439
- 0
src/controller/admin/spider.js 查看文件

@@ -0,0 +1,439 @@
const Base = require('../base');
const axios = require('axios');
const cheerio = require('cheerio');
const fs = require('fs');
const path = require('path');
const COS = require('cos-nodejs-sdk-v5');
const cosConfig = require('../../config/cos.js');

// Origin (legacy) site configuration.
// SECURITY(review): the fallback value below looks like an admin session
// cookie committed to source control. Prefer supplying it through the
// SPIDER_ORIGIN_COOKIE environment variable; once deployments set that
// variable, remove the hard-coded fallback and invalidate the session.
const ORIGIN = {
  base: 'http://www.vkfoundation.cn',
  listUrl: 'http://www.vkfoundation.cn/manage/function.php',
  cookie: process.env.SPIDER_ORIGIN_COOKIE || 'PHPSESSID=kjcj0j7qtdlqhm0rdqie67qnh6; zzz811_token=0c091635c6decb9333232a08bef265cf; zzz904_adminpass=0; zzz904_adminname=vkadmin; zzz904_admintime=1771944843; zzz904_adminface=..%2Fplugins%2Fface%2Fface01.png'
};

// Pause between consecutive requests to the origin site, in milliseconds.
const DELAY = 500;

/**
 * Resolve after roughly `ms` milliseconds.
 * @param {number} ms - Delay in milliseconds.
 * @returns {Promise<void>}
 */
const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

module.exports = class extends Base {

/**
* 步骤1:拉取列表并入库
* GET /admin/spider/fetch-list
*/
async fetchListAction() {
try {
const res = await axios.post(
`${ORIGIN.listUrl}?act=contentlist&id=&type=news`,
'order=asc&limit=9999&offset=0',
{
headers: {
Cookie: ORIGIN.cookie,
'Content-Type': 'application/x-www-form-urlencoded'
}
}
);

const data = res.data;
if (!data || !data.rows) {
return this.json({ code: 1, msg: '列表数据获取失败' });
}

const model = this.model('spider_news');
let inserted = 0;
let skipped = 0;

for (const row of data.rows) {
const originId = parseInt(row.id);
// 从HTML片段中提取数据
const title = this.extractTitle(row.c_title);
const category = this.extractCategory(row.c_sid);
const publishTime = this.extractTime(row.c_addtime);

// 检查是否已存在
const exists = await model.where({ origin_id: originId }).find();
if (exists && exists.id) {
skipped++;
continue;
}

await model.add({
origin_id: originId,
title,
category,
publish_time: publishTime,
status: 0
});
inserted++;
}

return this.json({
code: 0,
msg: `列表拉取完成,新增${inserted}条,跳过${skipped}条`,
data: { total: data.total, inserted, skipped }
});
} catch (err) {
think.logger.error('拉取列表失败:', err);
return this.json({ code: 1, msg: '拉取列表失败: ' + err.message });
}
}

/**
* 步骤2:爬取详情页内容
* GET /admin/spider/fetch-detail
* 参数: id (可选,指定单条;不传则处理所有status=0的)
*/
async fetchDetailAction() {
const targetId = this.get('id');
const model = this.model('spider_news');

let list;
if (targetId) {
list = await model.where({ origin_id: targetId }).select();
} else {
list = await model.where({ status: 0 }).select();
}

if (!list.length) {
return this.json({ code: 0, msg: '没有待爬取的记录' });
}

let success = 0;
let fail = 0;

for (const item of list) {
try {
const detailUrl = `${ORIGIN.base}/?news/${item.origin_id}.html`;
const res = await axios.get(detailUrl, {
headers: { Cookie: ORIGIN.cookie },
responseType: 'arraybuffer'
});

// 处理编码
const html = res.data.toString('utf-8');
const $ = cheerio.load(html);

// 从截图看,内容在 div.maximg 中
const contentEl = $('div.maximg');
let contentHtml = '';
if (contentEl.length) {
contentHtml = contentEl.html() || '';
} else {
// 备选:尝试 .picture-7 容器
const pictureEl = $('div.picture-7');
contentHtml = pictureEl.html() || '';
}

// 提取封面图(取正文中第一张图)
let coverOriginal = '';
const firstImg = contentEl.find('img').first();
if (firstImg.length) {
coverOriginal = firstImg.attr('src') || '';
if (coverOriginal && !coverOriginal.startsWith('http')) {
coverOriginal = ORIGIN.base + (coverOriginal.startsWith('/') ? '' : '/') + coverOriginal;
}
}

await model.where({ id: item.id }).update({
content_original: contentHtml,
cover_original: coverOriginal,
status: 1,
error_msg: ''
});
success++;
await sleep(DELAY);
} catch (err) {
think.logger.error(`爬取详情失败 origin_id=${item.origin_id}:`, err);
await model.where({ id: item.id }).update({
status: -1,
error_msg: err.message
});
fail++;
}
}

return this.json({
code: 0,
msg: `详情爬取完成,成功${success}条,失败${fail}条`,
data: { success, fail }
});
}

/**
* 步骤3:转存图片
* GET /admin/spider/transfer-images
* 参数: id (可选,指定单条;不传则处理所有status=1的)
*/
async transferImagesAction() {
const targetId = this.get('id');
const model = this.model('spider_news');

let list;
if (targetId) {
list = await model.where({ origin_id: targetId }).select();
} else {
list = await model.where({ status: 1 }).select();
}

if (!list.length) {
return this.json({ code: 0, msg: '没有待转存图片的记录' });
}

const cos = new COS({
SecretId: cosConfig.secretId,
SecretKey: cosConfig.secretKey
});

let success = 0;
let fail = 0;

for (const item of list) {
try {
const $ = cheerio.load(item.content_original, null, false);
const imgElements = $('img');
const imageMap = [];

for (let i = 0; i < imgElements.length; i++) {
const img = imgElements.eq(i);
let src = img.attr('src') || '';
if (!src) continue;

// 补全URL
const originalUrl = src.startsWith('http') ? src : ORIGIN.base + (src.startsWith('/') ? '' : '/') + src;

try {
const newUrl = await this.downloadAndUpload(cos, originalUrl);
img.attr('src', newUrl);
imageMap.push({ original: originalUrl, new: newUrl });
} catch (imgErr) {
think.logger.warn(`图片转存失败 ${originalUrl}:`, imgErr.message);
imageMap.push({ original: originalUrl, new: '', error: imgErr.message });
}
await sleep(300);
}

// 转存封面图
let cover = '';
if (item.cover_original) {
try {
cover = await this.downloadAndUpload(cos, item.cover_original);
} catch (coverErr) {
think.logger.warn(`封面图转存失败:`, coverErr.message);
}
}

await model.where({ id: item.id }).update({
content: $.html(),
cover,
images: JSON.stringify(imageMap),
status: 2,
error_msg: ''
});
success++;
} catch (err) {
think.logger.error(`图片转存失败 origin_id=${item.origin_id}:`, err);
await model.where({ id: item.id }).update({
status: -1,
error_msg: '图片转存失败: ' + err.message
});
fail++;
}
}

return this.json({
code: 0,
msg: `图片转存完成,成功${success}条,失败${fail}条`,
data: { success, fail }
});
}

/**
* 下载图片并上传到COS
*/
async downloadAndUpload(cos, imageUrl) {
// 下载图片
const res = await axios.get(imageUrl, {
responseType: 'arraybuffer',
headers: { Cookie: ORIGIN.cookie },
timeout: 15000
});

const buffer = Buffer.from(res.data);
const contentType = res.headers['content-type'] || 'image/jpeg';

// 生成COS路径
const now = new Date();
const dateFolder = `${now.getFullYear()}/${String(now.getMonth() + 1).padStart(2, '0')}/${String(now.getDate()).padStart(2, '0')}`;
const ext = this.getExtFromContentType(contentType);
const fileName = `spider_${Date.now()}_${Math.random().toString(36).slice(2, 8)}${ext}`;
const cosKey = `uploads/${dateFolder}/${fileName}`;

// 上传到COS
await new Promise((resolve, reject) => {
cos.putObject({
Bucket: cosConfig.bucket,
Region: cosConfig.region,
Key: cosKey,
Body: buffer,
ContentType: contentType
}, (err, data) => {
if (err) reject(err);
else resolve(data);
});
});

return `${cosConfig.cdnUrl}/${cosKey}`;
}

getExtFromContentType(contentType) {
const map = {
'image/jpeg': '.jpg',
'image/jpg': '.jpg',
'image/png': '.png',
'image/gif': '.gif',
'image/webp': '.webp',
'image/bmp': '.bmp'
};
return map[contentType] || '.jpg';
}

/**
* 步骤4:同步到article表
* GET /admin/spider/sync
* 参数: col (必填,目标栏目key)
* 参数: id (可选,指定单条origin_id)
*/
async syncAction() {
const col = this.get('col');
if (!col) {
return this.json({ code: 1, msg: '请指定目标栏目col参数' });
}

const column = await this.model('column').where({ key: col, is_deleted: 0 }).find();
if (!column || !column.id) {
return this.json({ code: 1, msg: '栏目不存在: ' + col });
}

const targetId = this.get('id');
const model = this.model('spider_news');

let list;
if (targetId) {
list = await model.where({ origin_id: targetId, status: 2 }).select();
} else {
list = await model.where({ status: 2 }).select();
}

if (!list.length) {
return this.json({ code: 0, msg: '没有待同步的记录' });
}

const articleModel = this.model('article');
let success = 0;
let fail = 0;

for (const item of list) {
try {
const articleId = await articleModel.add({
column_id: column.id,
title: item.title,
summary: '',
content: item.content || item.content_original || '',
cover: item.cover || '',
category: item.category || '',
is_top: 0,
is_recommend: 0,
sort: 0,
status: 1,
publish_time: item.publish_time
});

await model.where({ id: item.id }).update({
article_id: articleId,
status: 3
});
success++;
} catch (err) {
think.logger.error(`同步失败 origin_id=${item.origin_id}:`, err);
await model.where({ id: item.id }).update({
error_msg: '同步失败: ' + err.message
});
fail++;
}
}

return this.json({
code: 0,
msg: `同步完成,成功${success}条,失败${fail}条`,
data: { success, fail }
});
}

/**
* 查看爬取状态
* GET /admin/spider/status
*/
async statusAction() {
const model = this.model('spider_news');
const total = await model.count();
const pending = await model.where({ status: 0 }).count();
const detailed = await model.where({ status: 1 }).count();
const transferred = await model.where({ status: 2 }).count();
const synced = await model.where({ status: 3 }).count();
const failed = await model.where({ status: -1 }).count();

return this.json({
code: 0,
data: {
total,
pending, // 待爬取详情
detailed, // 已爬详情,待转存图片
transferred, // 图片已转存,待同步
synced, // 已同步
failed // 失败
}
});
}

/**
* 重试失败的记录
* GET /admin/spider/retry
* 参数: step (detail|images) 重试哪一步
*/
async retryAction() {
const step = this.get('step');
const model = this.model('spider_news');

if (step === 'detail') {
await model.where({ status: -1 }).update({ status: 0, error_msg: '' });
} else if (step === 'images') {
await model.where({ status: -1 }).update({ status: 1, error_msg: '' });
} else {
return this.json({ code: 1, msg: '请指定step参数: detail 或 images' });
}

return this.json({ code: 0, msg: '已重置失败记录' });
}

// ---- 辅助方法:从HTML片段提取数据 ----

extractTitle(html) {
if (!html) return '';
const match = html.match(/(?:value|placeholder)="([^"]+)"/);
return match ? match[1] : '';
}

extractCategory(html) {
if (!html) return '';
const match = html.match(/>([^<]+)</);
return match ? match[1] : '';
}

extractTime(html) {
if (!html) return null;
const match = html.match(/(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2})/);
return match ? match[1] : null;
}
};

+ 7
- 0
src/controller/base.js 查看文件

@@ -10,6 +10,13 @@ const WHITE_LIST = [
'/admin/login',
'/admin/auth/login',
'/admin/logout',
// 爬虫工具
'/admin/spider/fetch-list',
'/admin/spider/fetch-detail',
'/admin/spider/transfer-images',
'/admin/spider/sync',
'/admin/spider/status',
'/admin/spider/retry',
];

// 前台路由前缀(全部放行)


+ 5
- 0
src/model/spider_news.js 查看文件

@@ -0,0 +1,5 @@
module.exports = class extends think.Model {
get tableName() {
return 'pap_spider_news';
}
};

正在加载...
取消
保存