From a84ca2ea57eb37e901031d0639c0fc1048ef8dd8 Mon Sep 17 00:00:00 2001 From: leiyun Date: Tue, 24 Feb 2026 23:54:38 +0800 Subject: [PATCH] =?UTF-8?q?=E5=B0=B1=E7=AB=99=E6=96=87=E7=AB=A0=E7=88=AC?= =?UTF-8?q?=E5=8F=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- package.json | 2 + pnpm-lock.yaml | 221 ++++++++++++++ sql/update_20260224_spider_news.sql | 21 ++ src/config/router.js | 8 + src/controller/admin/spider.js | 439 ++++++++++++++++++++++++++++ src/controller/base.js | 7 + src/model/spider_news.js | 5 + 7 files changed, 703 insertions(+) create mode 100644 sql/update_20260224_spider_news.sql create mode 100644 src/controller/admin/spider.js create mode 100644 src/model/spider_news.js diff --git a/package.json b/package.json index bd74538..89759b5 100644 --- a/package.json +++ b/package.json @@ -10,6 +10,8 @@ "lint-fix": "eslint --fix src/" }, "dependencies": { + "axios": "^1.13.5", + "cheerio": "^1.2.0", "cos-nodejs-sdk-v5": "^2.14.0", "jsonwebtoken": "^9.0.3", "sharp": "^0.33.0", diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 7769980..ec75431 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -8,6 +8,12 @@ importers: .: dependencies: + axios: + specifier: ^1.13.5 + version: 1.13.5 + cheerio: + specifier: ^1.2.0 + version: 1.2.0 cos-nodejs-sdk-v5: specifier: ^2.14.0 version: 2.15.4 @@ -471,6 +477,9 @@ packages: aws4@1.13.2: resolution: {integrity: sha512-lHe62zvbTB5eEABUVi/AwVh0ZKY9rMMDhmm+eeyuuUQbQ3+J+fONVQOZyj+DdrvD4BY33uYniyRJ4UJIaSKAfw==} + axios@1.13.5: + resolution: {integrity: sha512-cz4ur7Vb0xS4/KUN0tPWe44eqxrIu31me+fbang3ijiNscE129POzipJJA6zniq2C/Z6sJCjMimjS8Lc/GAs8Q==} + babel-code-frame@6.26.0: resolution: {integrity: sha512-XqYMR2dfdGMW+hd0IUZ2PwK+fGeFkOxZJ0wY+JaQAHzt1Zx8LcvpiZD2NiGkEG8qx0CfkAOr5xt76d1e8vG90g==} @@ -608,6 +617,9 @@ packages: bluebird@3.7.2: resolution: {integrity: sha512-XpNj6GDQzdfW+r2Wnn7xiSAd7TM3jzkxGXBGTtWKuSXv1xUV+azxAm8jdWZN06QTQk+2N2XB9jRDkvbmQmcRtg==} + boolbase@1.0.0: + resolution: {integrity: sha512-JZOSA7Mo9sNGB8+UjSgzdLtokWAky1zbztM3WRLCbZ70/3cTANmQmOdR7y2g+J0e2WXywy1yS468tY+IruqEww==} + boxen@0.6.0: resolution: {integrity: sha512-yL8sYzt0avlYGOY6LqtECkGrJOY3cCLAbFPaNfgE+4fD45ZrdYqLdY8yF4bqyTkpfW9e6W0YqBkN7dIn/PrZoA==} engines: {node: '>=0.10.0'} @@ -706,6 +718,13 @@ packages: chardet@0.4.2: resolution: {integrity: sha512-j/Toj7f1z98Hh2cYo2BVr85EpIRWqUi7rtRSGxh/cqUjqrnJe9l9UE7IUGd2vQ2p+kSHLkSzObQPZPLUC6TQwg==} + cheerio-select@2.1.0: + resolution: {integrity: sha512-9v9kG0LvzrlcungtnJtpGNxY+fzECQKhK4EGJX2vByejiMX84MFNQw4UxPJl3bFbTMw+Dfs37XaIkCwTZfLh4g==} + + cheerio@1.2.0: + resolution: {integrity: sha512-WDrybc/gKFpTYQutKIK6UvfcuxijIZfMfXaYm8NMsPQxSYvf+13fXUJ4rztGGbJcBQ/GF55gvrZ0Bc0bj/mqvg==} + engines: {node: '>=20.18.1'} + chokidar@1.7.0: resolution: {integrity: sha512-mk8fAWcRUOxY7btlLtitj3A45jOwSAxH4tOFOoEGbVsl6cL6pPMWUy7dwZ/canfj3QEdP6FHSnf/l1c6/WkzVg==} @@ -879,6 +898,13 @@ packages: cross-spawn@5.1.0: resolution: {integrity: sha512-pTgQJ5KC0d2hcY8eyL1IzlBPYjTkyH72XRZPnLyKus2mBfNjQs3klqbJU2VILqZryAZUt9JOb3h/mWMy23/f5A==} + css-select@5.2.2: + resolution: {integrity: sha512-TizTzUddG/xYLA3NXodFM0fSbNizXjOKhqiQQwvhlspadZokn1KDy0NZFS0wuEubIYAV5/c1/lAr0TaaFXEXzw==} + + css-what@6.2.2: + resolution: {integrity: sha512-u/O3vwbptzhMs3L1fQE82ZSLHQQfto5gyZzwteVIEyeaY5Fc7R4dapF/BvRoSYFeqfBk4m0V1Vafq5Pjv25wvA==} + engines: {node: '>= 6'} + currently-unhandled@0.4.1: resolution: {integrity: sha512-/fITjgjGU50vjQ4FH6eUoYu+iUoUKIXws2hL15JJpIR+BbTxaXQsMuuyjtNh2WqsSBS5nsaZHFsFecyw5CCAng==} engines: {node: '>=0.10.0'} @@ -1017,6 +1043,19 @@ packages: resolution: {integrity: sha512-35mSku4ZXK0vfCuHEDAwt55dg2jNajHZ1odvF+8SSr82EsZY4QmXfuWso8oEd8zRhVObSN18aM0CjSdoBX7zIw==} engines: {node: '>=0.10.0'} + dom-serializer@2.0.0: + resolution: {integrity: sha512-wIkAryiqt/nV5EQKqQpo3SToSOV9J0DnbJqwK7Wv/Trc92zIAYZ4FlMu+JPFW1DfGFt81ZTCGgDEabffXeLyJg==} + + domelementtype@2.3.0: + resolution: {integrity: sha512-OLETBj6w0OsagBwdXnPdN0cnMfF9opN69co+7ZrbfPGrdpPVNBUj02spi6B1N7wChLQiPn4CSH/zJvXw56gmHw==} + + domhandler@5.0.3: + resolution: {integrity: sha512-cgwlv/1iFQiFnU96XXgROh8xTeetsnJiDsTc7TYCLFd9+/WNkIqPTxiM/8pSd8VIrhXGTf1Ny1q1hquVqDJB5w==} + engines: {node: '>= 4'} + + domutils@3.2.2: + resolution: {integrity: sha512-6kZKyUajlDuqlHKVX1w7gyslj9MPIXzIFiz/rGu35uC1wMi+kMhQwGhl4lt9unC9Vb9INnY9Z3/ZA3+FhASLaw==} + dot-prop@3.0.0: resolution: {integrity: sha512-k4ELWeEU3uCcwub7+dWydqQBRjAjkV9L33HjVRG5Xo2QybI6ja/v+4W73SRi8ubCqJz0l9XsTP1NbewfyqaSlw==} engines: {node: '>=0.10.0'} @@ -1055,6 +1094,21 @@ packages: resolution: {integrity: sha512-TPJXq8JqFaVYm2CWmPvnP2Iyo4ZSM7/QKcSmuMLDObfpH5fi7RUGmd/rTDf+rut/saiDiQEeVTNgAmJEdAOx0w==} engines: {node: '>= 0.8'} + encoding-sniffer@0.2.1: + resolution: {integrity: sha512-5gvq20T6vfpekVtqrYQsSCFZ1wEg5+wW0/QaZMWkFr6BqD3NfKs0rLCx4rrVlSWJeZb5NBJgVLswK/w2MWU+Gw==} + + entities@4.5.0: + resolution: {integrity: sha512-V0hjH4dGPh9Ao5p0MoRY6BVqtwCjhz6vI5LT8AJ55H+4g9/4vbHx1I54fS0XuclLhDHArPQCiMjDxjaL8fPxhw==} + engines: {node: '>=0.12'} + + entities@6.0.1: + resolution: {integrity: sha512-aN97NXWF6AWBTahfVOIrB/NShkzi5H7F9r1s9mD3cDj4Ko5f2qhhVoYMibXF7GlLveb/D2ioWay8lxI97Ven3g==} + engines: {node: '>=0.12'} + + entities@7.0.1: + resolution: {integrity: sha512-TWrgLOFUQTH994YUyl1yT4uyavY5nNB5muff+RtWaqNVCAK408b5ZnnbNAUEWLTCpum9w6arT70i1XdQ4UeOPA==} + engines: {node: '>=0.12'} + env-paths@2.2.1: resolution: {integrity: sha512-+h1lkLKhZMTYjog1VEpJNG7NZJWcuc2DDk/qsqSTRRCOXiLjeQ1d1/udrUGhqMxUgAlwKNZ0cf2uqan5GLuS2A==} engines: {node: '>=6'} @@ -1323,6 +1377,15 @@ packages: resolution: {integrity: sha512-oIDB1rXf3BUnn00bh2jVM0byuqr94rBh6g7ZfdKcbmp1we2GQtPzKdloyvBXHs+q3fvxB8EqX5ecFba3RwCSjA==} engines: {node: '>=0.10.0'} + follow-redirects@1.15.11: + resolution: {integrity: sha512-deG2P0JfjrTxl50XGCDyfI97ZGVCxIpfKYmfyrQ54n5FO/0gfIES8C/Psl6kWVDolizcaaxZJnTS0QSMxvnsBQ==} + engines: {node: '>=4.0'} + peerDependencies: + debug: '*' + peerDependenciesMeta: + debug: + optional: true + for-each@0.3.5: resolution: {integrity: sha512-dKx12eRCVIzqCxFGplyFKJMPvLEWgmNtUrpTiJIR5u97zEhRG8ySrtboPHZXx7daLxQVrl643cTzbab2tkQjxg==} engines: {node: '>= 0.4'} @@ -1342,6 +1405,10 @@ packages: resolution: {integrity: sha512-1lLKB2Mu3aGP1Q/2eCOx0fNbRMe7XdwktwOruhfqqd0rIJWwN4Dh+E3hrPSlDCXnSR7UtZ1N38rVXm+6+MEhJQ==} engines: {node: '>= 0.12'} + form-data@4.0.5: + resolution: {integrity: sha512-8RipRLol37bNs2bhoV67fiTEvdTrbMUYcFTiy3+wuuOnUog2QBHCZWXDRijWQfAkhBj2Uf5UnVaiWwA5vdd82w==} + engines: {node: '>= 6'} + formidable@1.2.6: resolution: {integrity: sha512-KcpbcpuLNOwrEjnbpMC0gS+X8ciDoZE1kkqzat4a8vrprf+s9pKNQ/QIwWfbfs4ltgmFl3MD177SNTkve3BwGQ==} deprecated: 'Please upgrade to latest, formidable@v2 or formidable@v3! Check these notes: https://bit.ly/2ZEqIau' @@ -1528,6 +1595,9 @@ packages: hosted-git-info@2.8.9: resolution: {integrity: sha512-mxIDAb9Lsm6DoOJ7xH+5+X4y1LU/4Hi50L9C5sIswK3JzULS4bwk1FvjdBgvYR4bzT4tuUQiC15FE2f5HbLvYw==} + htmlparser2@10.1.0: + resolution: {integrity: sha512-VTZkM9GWRAtEpveh7MSF6SjjrpNVNNVJfFup7xTY3UpFtm67foy9HDVXneLtFVt4pMz5kZtgNcvCniNFb1hlEQ==} + http-assert@1.5.0: resolution: {integrity: sha512-uPpH7OKX4H25hBmU6G1jWNaqJGpTXxey+YOUizJUAgu0AjLUeC8D73hTrhvDS5D+GJN1DN1+hhc/eF/wpxtp0w==} engines: {node: '>= 0.8'} @@ -1552,6 +1622,10 @@ packages: resolution: {integrity: sha512-v3MXnZAcvnywkTUEZomIActle7RXXeedOR31wwl7VlyoXO4Qi9arvSenNQWne1TcRwhCL1HwLI21bEqdpj8/rA==} engines: {node: '>=0.10.0'} + iconv-lite@0.6.3: + resolution: {integrity: sha512-4fCk79wshMdzMp2rH06qWrJE4iolqLhCUH+OiuIgU++RB0+94NlDL81atO7GX55uUKueo0txHNtvEyI6D7WdMw==} + engines: {node: '>=0.10.0'} + ignore-by-default@1.0.1: resolution: {integrity: sha512-Ius2VYcGNk7T90CppJqcIkS5ooHUZyIQK+ClZfMfMNFEF9VSE73Fq+906u/CWu92x4gzZMWOwfFYckPObzdEbA==} @@ -2269,6 +2343,9 @@ packages: resolution: {integrity: sha512-lJxZYlT4DW/bRUtFh1MQIWqmLwQfAxnqWG4HhEdjMlkrJYnJn0Jrr2u3mgxqaWsdiBc76TYkTG/mhrnYTuzfHw==} engines: {node: '>=4'} + nth-check@2.1.1: + resolution: {integrity: sha512-lqjrjmaOoAnWfMmBPL+XNnynZh2+swxiX3WUE0s4yEHI6m+AwrK2UZOimIRl3X/4QctVqS8AiZjFqyOGrMXb/w==} + number-is-nan@1.0.1: resolution: {integrity: sha512-4jbtZXNAsfZbAHiiqjLPBiCl16dES1zI4Hpzzxw61Tk+loF+sBDBKx1ICKKKwIqQ7M0mFn1TmkN7euSncWgHiQ==} engines: {node: '>=0.10.0'} @@ -2465,6 +2542,15 @@ packages: resolution: {integrity: sha512-LpH1Cf5EYuVjkBvCDBYvkUPh+iv2bk3FHflxHkpCYT0/FZ1d3N3uJaLiHr4yGuMcFUhv6eAivitTvWZI4B/chg==} engines: {node: '>=0.10.0'} + parse5-htmlparser2-tree-adapter@7.1.0: + resolution: {integrity: sha512-ruw5xyKs6lrpo9x9rCZqZZnIUntICjQAd0Wsmp396Ul9lN/h+ifgVV1x1gZHi8euej6wTfpqX8j+BFQxF0NS/g==} + + parse5-parser-stream@7.1.2: + resolution: {integrity: sha512-JyeQc9iwFLn5TbvvqACIF/VXG6abODeB3Fwmv/TGdLk2LfbWkaySGY72at4+Ty7EkPZj854u4CrICqNk2qIbow==} + + parse5@7.3.0: + resolution: {integrity: sha512-IInvU7fabl34qmi9gY8XOVxhYyMyuH2xUNpb2q8/Y+7552KlejkRvqvD19nMoUW/uQGGbqNpA6Tufu5FL5BZgw==} + parseurl@1.3.3: resolution: {integrity: sha512-CiyeOxFT/JZyN5m0z9PfXw4SCBJ6Sygz1Dpl0wqjlhDEGGBP1GnsUVEL0p63hoG1fcj3fHynXi9NYO4nWOL+qQ==} engines: {node: '>= 0.8'} @@ -2603,6 +2689,9 @@ packages: resolution: {integrity: sha512-7PiHtLll5LdnKIMw100I+8xJXR5gW2QwWYkT6iJva0bXitZKa/XMrSbdmg3r2Xnaidz9Qumd0VPaMrZlF9V9sA==} engines: {node: '>=0.4.0'} + proxy-from-env@1.1.0: + resolution: {integrity: sha512-D+zkORCbA9f1tdWRK0RaCR3GPv50cMxcrz4X8k5LTSUD1Dkw47mKJEZQNunItRTkWwgtaUSo1RVFRIG9ZXiFYg==} + pseudomap@1.0.2: resolution: {integrity: sha512-b/YwNhb8lk1Zz2+bXXpS/LK9OisiZZ1SNsSLxN1x2OXVEhW2Ckr/7mWE5vrC1ZTiJlD9g19jWszTmJsB+oEpFQ==} @@ -3332,6 +3421,10 @@ packages: undici-types@7.16.0: resolution: {integrity: sha512-Zz+aZWSj8LE6zoxD+xrjh4VfkIG8Ya6LvYkZqtUQGJPZjYl53ypCaUwWqo7eI0x66KBGeRo+mlBEkMSeSZ38Nw==} + undici@7.22.0: + resolution: {integrity: sha512-RqslV2Us5BrllB+JeiZnK4peryVTndy9Dnqq62S3yYRRTj0tFQCwEniUy2167skdGOy3vqRzEvl1Dm4sV2ReDg==} + engines: {node: '>=20.18.1'} + union-value@1.0.1: resolution: {integrity: sha512-tJfXmxMeWYnczCVs7XAEvIV7ieppALdyepWMkHkwciRpZraG/xwT+s2JN8+pr1+8jCRf80FFzvr+MpQeeoF4Xg==} engines: {node: '>=0.10.0'} @@ -3406,6 +3499,14 @@ packages: resolution: {integrity: sha512-ZZKSmDAEFOijERBLkmYfJ+vmk3w+7hOLYDNkRCuRuMJGEmqYNCNLyBBFwWKVMhfwaEF3WOd0Zlw86U/WC/+nYw==} engines: {'0': node >=0.6.0} + whatwg-encoding@3.1.1: + resolution: {integrity: sha512-6qN4hJdMwfYBtE3YBTTHhoeuUrDBPZmbQaxWAqSALV/MeEnR5z1xd8UKud2RAkFoPkmB+hli1TZSnyi84xz1vQ==} + engines: {node: '>=18'} + + whatwg-mimetype@4.0.0: + resolution: {integrity: sha512-QaKxh0eNIi2mE9p2vEdzfagOKHCcj1pJ56EEHGQOVxp8r9/iszLUUV7v89x9O1p/T+NlTM5W7jW6+cz4Fq1YVg==} + engines: {node: '>=18'} + which-boxed-primitive@1.1.1: resolution: {integrity: sha512-TbX3mj8n0odCBFVlY8AxkqcHASw3L60jIuF8jFP78az3C2YhmGvqbHBpAjTRH2/xqYunrJ9g1jSyjCjpoWzIAA==} engines: {node: '>= 0.4'} @@ -3934,6 +4035,14 @@ snapshots: aws4@1.13.2: {} + axios@1.13.5: + dependencies: + follow-redirects: 1.15.11 + form-data: 4.0.5 + proxy-from-env: 1.1.0 + transitivePeerDependencies: + - debug + babel-code-frame@6.26.0: dependencies: chalk: 1.1.3 @@ -4232,6 +4341,8 @@ snapshots: bluebird@3.7.2: {} + boolbase@1.0.0: {} + boxen@0.6.0: dependencies: ansi-align: 1.1.0 @@ -4366,6 +4477,29 @@ snapshots: chardet@0.4.2: {} + cheerio-select@2.1.0: + dependencies: + boolbase: 1.0.0 + css-select: 5.2.2 + css-what: 6.2.2 + domelementtype: 2.3.0 + domhandler: 5.0.3 + domutils: 3.2.2 + + cheerio@1.2.0: + dependencies: + cheerio-select: 2.1.0 + dom-serializer: 2.0.0 + domhandler: 5.0.3 + domutils: 3.2.2 + encoding-sniffer: 0.2.1 + htmlparser2: 10.1.0 + parse5: 7.3.0 + parse5-htmlparser2-tree-adapter: 7.1.0 + parse5-parser-stream: 7.1.2 + undici: 7.22.0 + whatwg-mimetype: 4.0.0 + chokidar@1.7.0: dependencies: anymatch: 1.3.2 @@ -4554,6 +4688,16 @@ snapshots: shebang-command: 1.2.0 which: 1.3.1 + css-select@5.2.2: + dependencies: + boolbase: 1.0.0 + css-what: 6.2.2 + domhandler: 5.0.3 + domutils: 3.2.2 + nth-check: 2.1.1 + + css-what@6.2.2: {} + currently-unhandled@0.4.1: dependencies: array-find-index: 1.0.2 @@ -4668,6 +4812,24 @@ snapshots: dependencies: esutils: 2.0.3 + dom-serializer@2.0.0: + dependencies: + domelementtype: 2.3.0 + domhandler: 5.0.3 + entities: 4.5.0 + + domelementtype@2.3.0: {} + + domhandler@5.0.3: + dependencies: + domelementtype: 2.3.0 + + domutils@3.2.2: + dependencies: + dom-serializer: 2.0.0 + domelementtype: 2.3.0 + domhandler: 5.0.3 + dot-prop@3.0.0: dependencies: is-obj: 1.0.1 @@ -4710,6 +4872,17 @@ snapshots: encodeurl@1.0.2: {} + encoding-sniffer@0.2.1: + dependencies: + iconv-lite: 0.6.3 + whatwg-encoding: 3.1.1 + + entities@4.5.0: {} + + entities@6.0.1: {} + + entities@7.0.1: {} + env-paths@2.2.1: {} equal-length@1.0.1: {} @@ -5101,6 +5274,8 @@ snapshots: fn-name@2.0.1: {} + follow-redirects@1.15.11: {} + for-each@0.3.5: dependencies: is-callable: 1.2.7 @@ -5119,6 +5294,14 @@ snapshots: combined-stream: 1.0.8 mime-types: 2.1.35 + form-data@4.0.5: + dependencies: + asynckit: 0.4.0 + combined-stream: 1.0.8 + es-set-tostringtag: 2.1.0 + hasown: 2.0.2 + mime-types: 2.1.35 + formidable@1.2.6: {} fragment-cache@0.2.1: @@ -5320,6 +5503,13 @@ snapshots: hosted-git-info@2.8.9: {} + htmlparser2@10.1.0: + dependencies: + domelementtype: 2.3.0 + domhandler: 5.0.3 + domutils: 3.2.2 + entities: 7.0.1 + http-assert@1.5.0: dependencies: deep-equal: 1.0.1 @@ -5358,6 +5548,10 @@ snapshots: dependencies: safer-buffer: 2.1.2 + iconv-lite@0.6.3: + dependencies: + safer-buffer: 2.1.2 + ignore-by-default@1.0.1: {} ignore@3.3.10: {} @@ -6130,6 +6324,10 @@ snapshots: dependencies: path-key: 2.0.1 + nth-check@2.1.1: + dependencies: + boolbase: 1.0.0 + number-is-nan@1.0.1: {} nunjucks@3.2.4: @@ -6305,6 +6503,19 @@ snapshots: parse-ms@1.0.1: {} + parse5-htmlparser2-tree-adapter@7.1.0: + dependencies: + domhandler: 5.0.3 + parse5: 7.3.0 + + parse5-parser-stream@7.1.2: + dependencies: + parse5: 7.3.0 + + parse5@7.3.0: + dependencies: + entities: 6.0.1 + parseurl@1.3.3: {} pascalcase@0.1.1: {} @@ -6408,6 +6619,8 @@ snapshots: progress@2.0.3: {} + proxy-from-env@1.1.0: {} + pseudomap@1.0.2: {} psl@1.15.0: @@ -7368,6 +7581,8 @@ snapshots: undici-types@7.16.0: {} + undici@7.22.0: {} + union-value@1.0.1: dependencies: arr-union: 3.1.0 @@ -7438,6 +7653,12 @@ snapshots: core-util-is: 1.0.2 extsprintf: 1.3.0 + whatwg-encoding@3.1.1: + dependencies: + iconv-lite: 0.6.3 + + whatwg-mimetype@4.0.0: {} + which-boxed-primitive@1.1.1: dependencies: is-bigint: 1.1.0 diff --git a/sql/update_20260224_spider_news.sql b/sql/update_20260224_spider_news.sql new file mode 100644 index 0000000..1a43a3e --- /dev/null +++ b/sql/update_20260224_spider_news.sql @@ -0,0 +1,21 @@ +-- 爬虫-新闻数据中间表 +CREATE TABLE IF NOT EXISTS `pap_spider_news` ( + `id` int(11) NOT NULL AUTO_INCREMENT, + `origin_id` int(11) NOT NULL DEFAULT 0 COMMENT '原站文章ID', + `title` varchar(500) NOT NULL DEFAULT '' COMMENT '文章标题', + `category` varchar(100) DEFAULT '' COMMENT '原站栏目分类', + `publish_time` datetime DEFAULT NULL COMMENT '原站发布时间', + `content` longtext COMMENT '处理后的正文(图片已转存)', + `content_original` longtext COMMENT '原始正文HTML(未处理图片)', + `cover` varchar(500) DEFAULT '' COMMENT '封面图(转存后)', + `cover_original` varchar(500) DEFAULT '' COMMENT '原始封面图URL', + `images` text COMMENT '图片映射JSON [{original, new}]', + `status` tinyint(1) DEFAULT 0 COMMENT '0待爬取 1已爬取详情 2图片已转存 3已同步到article -1失败', + `article_id` int(11) DEFAULT 0 COMMENT '同步到article表后的ID', + `error_msg` varchar(1000) DEFAULT '' COMMENT '错误信息', + `create_time` datetime DEFAULT CURRENT_TIMESTAMP, + `update_time` datetime DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP, + PRIMARY KEY (`id`), + UNIQUE KEY `uk_origin_id` (`origin_id`), + KEY `idx_status` (`status`) +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='爬虫-新闻数据'; diff --git a/src/config/router.js b/src/config/router.js index cdfc8d3..6c28447 100644 --- a/src/config/router.js +++ b/src/config/router.js @@ -108,6 +108,14 @@ module.exports = [ ['/admin/content/person/edit', 'admin/content/person/edit', 'post'], ['/admin/content/person/delete', 'admin/content/person/delete', 'post'], + // 爬虫工具 + ['/admin/spider/fetch-list', 'admin/spider/fetchList'], + ['/admin/spider/fetch-detail', 'admin/spider/fetchDetail'], + ['/admin/spider/transfer-images', 'admin/spider/transferImages'], + ['/admin/spider/sync', 'admin/spider/sync'], + ['/admin/spider/status', 'admin/spider/status'], + ['/admin/spider/retry', 'admin/spider/retry'], + // 内容管理 - 捐赠收支 ['/admin/content/donation', 'admin/content/donation/index'], ['/admin/content/donation/stat', 'admin/content/donation/stat'], diff --git a/src/controller/admin/spider.js b/src/controller/admin/spider.js new file mode 100644 index 0000000..b1426b9 --- /dev/null +++ b/src/controller/admin/spider.js @@ -0,0 +1,439 @@ +const Base = require('../base'); +const axios = require('axios'); +const cheerio = require('cheerio'); +const fs = require('fs'); +const path = require('path'); +const COS = require('cos-nodejs-sdk-v5'); +const cosConfig = require('../../config/cos.js'); + +// 原站配置 +const ORIGIN = { + base: 'http://www.vkfoundation.cn', + listUrl: 'http://www.vkfoundation.cn/manage/function.php', + cookie: 'PHPSESSID=kjcj0j7qtdlqhm0rdqie67qnh6; zzz811_token=0c091635c6decb9333232a08bef265cf; zzz904_adminpass=0; zzz904_adminname=vkadmin; zzz904_admintime=1771944843; zzz904_adminface=..%2Fplugins%2Fface%2Fface01.png' +}; + +// 请求间隔(毫秒) +const DELAY = 500; +const sleep = (ms) => new Promise(resolve => setTimeout(resolve, ms)); + +module.exports = class extends Base { + + /** + * 步骤1:拉取列表并入库 + * GET /admin/spider/fetch-list + */ + async fetchListAction() { + try { + const res = await axios.post( + `${ORIGIN.listUrl}?act=contentlist&id=&type=news`, + 'order=asc&limit=9999&offset=0', + { + headers: { + Cookie: ORIGIN.cookie, + 'Content-Type': 'application/x-www-form-urlencoded' + } + } + ); + + const data = res.data; + if (!data || !data.rows) { + return this.json({ code: 1, msg: '列表数据获取失败' }); + } + + const model = this.model('spider_news'); + let inserted = 0; + let skipped = 0; + + for (const row of data.rows) { + const originId = parseInt(row.id); + // 从HTML片段中提取数据 + const title = this.extractTitle(row.c_title); + const category = this.extractCategory(row.c_sid); + const publishTime = this.extractTime(row.c_addtime); + + // 检查是否已存在 + const exists = await model.where({ origin_id: originId }).find(); + if (exists && exists.id) { + skipped++; + continue; + } + + await model.add({ + origin_id: originId, + title, + category, + publish_time: publishTime, + status: 0 + }); + inserted++; + } + + return this.json({ + code: 0, + msg: `列表拉取完成,新增${inserted}条,跳过${skipped}条`, + data: { total: data.total, inserted, skipped } + }); + } catch (err) { + think.logger.error('拉取列表失败:', err); + return this.json({ code: 1, msg: '拉取列表失败: ' + err.message }); + } + } + + /** + * 步骤2:爬取详情页内容 + * GET /admin/spider/fetch-detail + * 参数: id (可选,指定单条;不传则处理所有status=0的) + */ + async fetchDetailAction() { + const targetId = this.get('id'); + const model = this.model('spider_news'); + + let list; + if (targetId) { + list = await model.where({ origin_id: targetId }).select(); + } else { + list = await model.where({ status: 0 }).select(); + } + + if (!list.length) { + return this.json({ code: 0, msg: '没有待爬取的记录' }); + } + + let success = 0; + let fail = 0; + + for (const item of list) { + try { + const detailUrl = `${ORIGIN.base}/?news/${item.origin_id}.html`; + const res = await axios.get(detailUrl, { + headers: { Cookie: ORIGIN.cookie }, + responseType: 'arraybuffer' + }); + + // 处理编码 + const html = res.data.toString('utf-8'); + const $ = cheerio.load(html); + + // 从截图看,内容在 div.maximg 中 + const contentEl = $('div.maximg'); + let contentHtml = ''; + if (contentEl.length) { + contentHtml = contentEl.html() || ''; + } else { + // 备选:尝试 .picture-7 容器 + const pictureEl = $('div.picture-7'); + contentHtml = pictureEl.html() || ''; + } + + // 提取封面图(取正文中第一张图) + let coverOriginal = ''; + const firstImg = contentEl.find('img').first(); + if (firstImg.length) { + coverOriginal = firstImg.attr('src') || ''; + if (coverOriginal && !coverOriginal.startsWith('http')) { + coverOriginal = ORIGIN.base + (coverOriginal.startsWith('/') ? '' : '/') + coverOriginal; + } + } + + await model.where({ id: item.id }).update({ + content_original: contentHtml, + cover_original: coverOriginal, + status: 1, + error_msg: '' + }); + success++; + await sleep(DELAY); + } catch (err) { + think.logger.error(`爬取详情失败 origin_id=${item.origin_id}:`, err); + await model.where({ id: item.id }).update({ + status: -1, + error_msg: err.message + }); + fail++; + } + } + + return this.json({ + code: 0, + msg: `详情爬取完成,成功${success}条,失败${fail}条`, + data: { success, fail } + }); + } + + /** + * 步骤3:转存图片 + * GET /admin/spider/transfer-images + * 参数: id (可选,指定单条;不传则处理所有status=1的) + */ + async transferImagesAction() { + const targetId = this.get('id'); + const model = this.model('spider_news'); + + let list; + if (targetId) { + list = await model.where({ origin_id: targetId }).select(); + } else { + list = await model.where({ status: 1 }).select(); + } + + if (!list.length) { + return this.json({ code: 0, msg: '没有待转存图片的记录' }); + } + + const cos = new COS({ + SecretId: cosConfig.secretId, + SecretKey: cosConfig.secretKey + }); + + let success = 0; + let fail = 0; + + for (const item of list) { + try { + const $ = cheerio.load(item.content_original, null, false); + const imgElements = $('img'); + const imageMap = []; + + for (let i = 0; i < imgElements.length; i++) { + const img = imgElements.eq(i); + let src = img.attr('src') || ''; + if (!src) continue; + + // 补全URL + const originalUrl = src.startsWith('http') ? src : ORIGIN.base + (src.startsWith('/') ? '' : '/') + src; + + try { + const newUrl = await this.downloadAndUpload(cos, originalUrl); + img.attr('src', newUrl); + imageMap.push({ original: originalUrl, new: newUrl }); + } catch (imgErr) { + think.logger.warn(`图片转存失败 ${originalUrl}:`, imgErr.message); + imageMap.push({ original: originalUrl, new: '', error: imgErr.message }); + } + await sleep(300); + } + + // 转存封面图 + let cover = ''; + if (item.cover_original) { + try { + cover = await this.downloadAndUpload(cos, item.cover_original); + } catch (coverErr) { + think.logger.warn(`封面图转存失败:`, coverErr.message); + } + } + + await model.where({ id: item.id }).update({ + content: $.html(), + cover, + images: JSON.stringify(imageMap), + status: 2, + error_msg: '' + }); + success++; + } catch (err) { + think.logger.error(`图片转存失败 origin_id=${item.origin_id}:`, err); + await model.where({ id: item.id }).update({ + status: -1, + error_msg: '图片转存失败: ' + err.message + }); + fail++; + } + } + + return this.json({ + code: 0, + msg: `图片转存完成,成功${success}条,失败${fail}条`, + data: { success, fail } + }); + } + + /** + * 下载图片并上传到COS + */ + async downloadAndUpload(cos, imageUrl) { + // 下载图片 + const res = await axios.get(imageUrl, { + responseType: 'arraybuffer', + headers: { Cookie: ORIGIN.cookie }, + timeout: 15000 + }); + + const buffer = Buffer.from(res.data); + const contentType = res.headers['content-type'] || 'image/jpeg'; + + // 生成COS路径 + const now = new Date(); + const dateFolder = `${now.getFullYear()}/${String(now.getMonth() + 1).padStart(2, '0')}/${String(now.getDate()).padStart(2, '0')}`; + const ext = this.getExtFromContentType(contentType); + const fileName = `spider_${Date.now()}_${Math.random().toString(36).slice(2, 8)}${ext}`; + const cosKey = `uploads/${dateFolder}/${fileName}`; + + // 上传到COS + await new Promise((resolve, reject) => { + cos.putObject({ + Bucket: cosConfig.bucket, + Region: cosConfig.region, + Key: cosKey, + Body: buffer, + ContentType: contentType + }, (err, data) => { + if (err) reject(err); + else resolve(data); + }); + }); + + return `${cosConfig.cdnUrl}/${cosKey}`; + } + + getExtFromContentType(contentType) { + const map = { + 'image/jpeg': '.jpg', + 'image/jpg': '.jpg', + 'image/png': '.png', + 'image/gif': '.gif', + 'image/webp': '.webp', + 'image/bmp': '.bmp' + }; + return map[contentType] || '.jpg'; + } + + /** + * 步骤4:同步到article表 + * GET /admin/spider/sync + * 参数: col (必填,目标栏目key) + * 参数: id (可选,指定单条origin_id) + */ + async syncAction() { + const col = this.get('col'); + if (!col) { + return this.json({ code: 1, msg: '请指定目标栏目col参数' }); + } + + const column = await this.model('column').where({ key: col, is_deleted: 0 }).find(); + if (!column || !column.id) { + return this.json({ code: 1, msg: '栏目不存在: ' + col }); + } + + const targetId = this.get('id'); + const model = this.model('spider_news'); + + let list; + if (targetId) { + list = await model.where({ origin_id: targetId, status: 2 }).select(); + } else { + list = await model.where({ status: 2 }).select(); + } + + if (!list.length) { + return this.json({ code: 0, msg: '没有待同步的记录' }); + } + + const articleModel = this.model('article'); + let success = 0; + let fail = 0; + + for (const item of list) { + try { + const articleId = await articleModel.add({ + column_id: column.id, + title: item.title, + summary: '', + content: item.content || item.content_original || '', + cover: item.cover || '', + category: item.category || '', + is_top: 0, + is_recommend: 0, + sort: 0, + status: 1, + publish_time: item.publish_time + }); + + await model.where({ id: item.id }).update({ + article_id: articleId, + status: 3 + }); + success++; + } catch (err) { + think.logger.error(`同步失败 origin_id=${item.origin_id}:`, err); + await model.where({ id: item.id }).update({ + error_msg: '同步失败: ' + err.message + }); + fail++; + } + } + + return this.json({ + code: 0, + msg: `同步完成,成功${success}条,失败${fail}条`, + data: { success, fail } + }); + } + + /** + * 查看爬取状态 + * GET /admin/spider/status + */ + async statusAction() { + const model = this.model('spider_news'); + const total = await model.count(); + const pending = await model.where({ status: 0 }).count(); + const detailed = await model.where({ status: 1 }).count(); + const transferred = await model.where({ status: 2 }).count(); + const synced = await model.where({ status: 3 }).count(); + const failed = await model.where({ status: -1 }).count(); + + return this.json({ + code: 0, + data: { + total, + pending, // 待爬取详情 + detailed, // 已爬详情,待转存图片 + transferred, // 图片已转存,待同步 + synced, // 已同步 + failed // 失败 + } + }); + } + + /** + * 重试失败的记录 + * GET /admin/spider/retry + * 参数: step (detail|images) 重试哪一步 + */ + async retryAction() { + const step = this.get('step'); + const model = this.model('spider_news'); + + if (step === 'detail') { + await model.where({ status: -1 }).update({ status: 0, error_msg: '' }); + } else if (step === 'images') { + await model.where({ status: -1 }).update({ status: 1, error_msg: '' }); + } else { + return this.json({ code: 1, msg: '请指定step参数: detail 或 images' }); + } + + return this.json({ code: 0, msg: '已重置失败记录' }); + } + + // ---- 辅助方法:从HTML片段提取数据 ---- + + extractTitle(html) { + if (!html) return ''; + const match = html.match(/(?:value|placeholder)="([^"]+)"/); + return match ? match[1] : ''; + } + + extractCategory(html) { + if (!html) return ''; + const match = html.match(/>([^<]+)