Node.js   发布时间:2022-04-24  发布网站:大佬教程  code.js-code.com
大佬教程收集整理的这篇文章主要介绍了《Nodejs爬取10G妹子套图(cheerio)》。大佬教程觉得挺不错的,现在分享给大家,也给大家做个参考。

某站全套妹子图采集

图片来源www.mzitu.com

源码

https://github.com/zhaojunlik...

Usage

安装依赖包

npm install
配置redis(必须)
如果需要服务器存储数据,请对接相关Api地址,进行图片套图和详细图的存储
node app.js

Nodejs爬取10G妹子套图cheerio

核心代码

/**
 * @Author:zhaojunlike
 * @Github:https://github.com/zhaojunlike
 * Created by zhaojunlike on 6/4/2017.
 */
// Module dependencies. Several of these (http, queryString, log4js, request)
// are not referenced by the visible code but are kept because other parts of
// the project may rely on them.
const http = require("http");
// The built-in module is named "querystring" (all lowercase); the mixed-case
// spelling only resolves by accident on case-insensitive file systems.
const queryString = require("querystring");
const restify = require('restify-clients');
const log4js = require("log4js");
const request = require('request');
const cheerio = require('cheerio');
const redis = require("redis");
const process = require("process");
const download = require('download');

// Shared Redis connection used for every crawl queue.
// Each option sits on its own line so that the commented-out alternative host
// cannot swallow the live options or the closing brace.
const redisConn = redis.createClient({
    // Alternative host when running inside the container network:
    // host: "redis-db",
    host: "192.168.99.100",
    port: "6379",
});
const url = require('url');
const fs = require('fs');
const path = require('path');
// HTTP headers sent with every request issued through the restify clients.
const requestHeaders = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML,like Gecko) Chrome/59.0.3047.4 Safari/537.36",
    "Host": "www.mzitu.com",
    'token': '4F39500149264DE474AA8FA4C67379D1',
};

// String client for the site being crawled.
const webClient = restify.createStringClient({
    url: 'http://www.mzitu.com',
    headers: requestHeaders
});

// String client for the storage API that receives crawled records.
// The options are split across lines so the commented-out alternative URL no
// longer comments out `headers` as well.
// NOTE(review): this also sends "Host: www.mzitu.com" to the storage API —
// confirm that the API server expects these headers.
const serverApiClient = restify.createStringClient({
    url: 'http://192.168.99.1:8080',
    // Alternative endpoint:
    // url: 'http://192.168.99.100:81',
    headers: requestHeaders
});
// Placeholder for Redis-specific settings (currently empty).
const redisConfig = {};

// Regular expressions used when parsing raw HTML.
const RegxConfig = {
    index_tag: /<dl(.*?)class="tags">([\s\S]*?)<\/dl>/g,
};

// Redis key names for each crawl queue / counter.
const cacheKeys = {
    index_tag: "index_tag_queue",
    tag_list: "tag_list_queue",
    taotu_list: "taotu_list",
    page_count: "page_count",
    page_detail: "page_detail_queue",
    img_download_url: "img_queue",
};

// Absolute URLs of the remote site.
const RemoteConfig = {
    host: 'http://www.mzitu.com',
    zhuanti: 'http://www.mzitu.com/zhuanti/',
    page: 'http://www.mzitu.com/page/',
};

// Paths of the storage API endpoints.
const ServerApi = {
    DocumentAdd: "/Document_add.action",
    PictureAdd: "/Picture_add.action",
};

// Mutable crawler state flags plus the local download root.
// BASE_PATH is on its own line so the commented-out alternative cannot
// comment it out, which previously left SpiderIDLE.BASE_PATH undefined.
let SpiderIDLE = {
    start: false,
    index_success: false,
    img_page_success: false,
    img_down_success: false,
    img_taotu_success: false,
    // Alternative download root:
    // BASE_PATH: "../storage/download",
    BASE_PATH: "../storage"
};

/**
 * Small file-system / URL helpers used by the crawler.
 */
const Tools = {
    /**
     * Split the path part of a URI into its directory and file name.
     *
     * The path is taken from `url.parse(uri).path`, so a query string, if
     * present, stays attached to the file name.
     *
     * @param {string} uri - URI to split, e.g. "http://host/2017/06/a.jpg".
     * @returns {{filename: string, filepath: string}} The last path segment
     *     and everything before it joined with "/" (e.g. "a.jpg", "/2017/06").
     */
    parseUri: function (uri) {
        // `path` is null for an empty URI; fall back to "" instead of throwing.
        const segments = (url.parse(uri).path || '').split('/');
        const filename = segments.pop();
        return {
            filename: filename,
            filepath: segments.join("/")
        };
    },

    /**
     * Make sure a directory exists, creating missing parent directories.
     *
     * @param {string} dirPath - Directory path (relative or absolute).
     * @returns {boolean} true when the path already exists or was created,
     *     false when creation failed (the error is logged).
     */
    checkDir: function (dirPath) {
        if (fs.existsSync(dirPath)) {
            return true;
        }
        try {
            // The mode must be octal: decimal 777 is 0o1411, not rwxrwxrwx.
            // `recursive` creates every missing segment and keeps the leading
            // "/" of absolute paths intact.
            fs.mkdirSync(dirPath, { recursive: true, mode: 0o777 });
            return true;
        } catch (err) {
            console.error("checkDir failed:", dirPath, err);
            return false;
        }
    }
};

/**
 * Crawl pipeline. Each stage reads from / writes to a Redis list or counter
 * named in `cacheKeys`, so stages can be driven independently by a timer:
 *
 *   getPagelist  -> stores the number of listing pages
 *   getImgPage   -> queues one record per gallery found on a listing page
 *   getTaoTuImgs -> queues one record per image page of a gallery
 *   downloadYY   -> downloads the image of one queued image page
 *
 * Callbacks receive only the result object; failures are logged and the
 * callback is simply not invoked.
 */
const Spider = {
    /**
     * Fetch the "/zhuanti/" page and push every tag found there onto the
     * index-tag queue.
     */
    start: function () {
        webClient.get('/zhuanti/', function (err, req, res, data) {
            if (err) {
                console.error(err);
                return;
            }
            const $ = cheerio.load(data);
            $(".postlist .tags dd").each(function (index, item) {
                const $item = $(item);
                const tag = {
                    title: $item.find('img').attr("alt"),
                    banner: $item.find('img').attr("src"),
                    url: $item.find("a").attr("href"),
                };
                // Append to the queue.
                redisConn.rpush(cacheKeys.index_tag, JSON.stringify(tag), function (pushErr, reply) {
                    console.log(pushErr, reply);
                });
            });
        });
    },

    /**
     * 1. Fetch the home page, read the pagination links and store the page
     * count in Redis.
     *
     * @param {function(string)} callBACk - Receives the value stored under
     *     the page-count key (a string, or null when the key is not set).
     */
    getPagelist: function (callBACk) {
        webClient.get('/', function (err, req, res, data) {
            if (err) {
                console.error(err);
                return;
            }
            const $ = cheerio.load(data);
            // Keep the largest page number among the pagination links.
            let pageCount = 0;
            $(".nav-links a[class='page-numbers']").each(function (index, item) {
                const found = ($(item).html() || '').match(/\d+/);
                const pageNumber = found ? parseInt(found[0], 10) : 0;
                if (pageNumber > pageCount) {
                    pageCount = pageNumber;
                }
            });
            if (pageCount > 0) {
                redisConn.set(cacheKeys.page_count, String(pageCount));
            }
            // Commands are sent in order, so this read sees the write above.
            redisConn.get(cacheKeys.page_count, function (getErr, reply) {
                if (getErr) {
                    console.error(getErr);
                    return;
                }
                callBACk(reply);
            });
        });
    },

    /**
     * 2. Take the next listing page (by decrementing the page counter), parse
     * the galleries on it and queue one record per gallery.
     *
     * NOTE(review): because the counter is decremented before use and the
     * crawl stops at <= 1, the highest-numbered page and page 1 are never
     * fetched — confirm whether that is intended.
     *
     * @param {function(Object)} callBACk - Invoked once per queued gallery
     *     record, after it has been pushed to Redis.
     */
    getImgPage: function (callBACk) {
        redisConn.decr(cacheKeys.page_count, function (err, pageNo) {
            if (err || !pageNo) {
                return;
            }
            if (parseInt(pageNo, 10) <= 1) {
                console.log("页面套图数据已经采集完毕了!!!!");
                SpiderIDLE.img_page_success = true;
                return;
            }
            console.log(`开始采集页面:${pageNo}`);
            // Crawl this listing page.
            webClient.get(`/page/${pageNo}/`, function (getErr, req, res, data) {
                if (getErr || !data) {
                    if (getErr) {
                        console.error(getErr);
                    }
                    return;
                }
                const $ = cheerio.load(data);
                $(".postlist #pins li").each(function (index, item) {
                    const $item = $(item);
                    const href = $item.find("a").attr("href");
                    const idMatch = href ? href.match(/\d+/) : null;
                    if (!idMatch) {
                        // Without a numeric id the gallery cannot be addressed later.
                        return;
                    }
                    const document = {
                        title: $item.find("img").attr("alt"),
                        url: href,
                        remote_path: '/' + idMatch[0],
                        content: $item.find("img").attr("data-original"),
                        page_num: pageNo,
                        category_id: pageNo,
                        create_time: $item.find(".time").html(),
                        update_time: "",
                        good_count: 0,
                        view_count: Math.random() * 1000000,
                        remote_id: idMatch[0],
                    };
                    // Queue the gallery.
                    // TODO: write to the gateway API.
                    redisConn.rpush(cacheKeys.page_detail, JSON.stringify(document), function (pushErr) {
                        if (pushErr) {
                            console.error(pushErr);
                            return;
                        }
                        callBACk(document);
                    });
                });
            });
        });
    },

    /**
     * 3. Pop one gallery record, read how many image pages it has and queue
     * one record per image page.
     *
     * NOTE(review): the loop starts at 2, so the gallery's first image page is
     * never queued — confirm whether that is intended.
     *
     * @param {function(Object)} callBACk - Accepted for symmetry with the
     *     other stages; it is never invoked.
     */
    getTaoTuImgs: function (callBACk) {
        redisConn.lpop(cacheKeys.page_detail, function (err, reply) {
            if (err || !reply) return;
            const document = JSON.parse(reply);
            const idMatch = document.url ? document.url.match(/\d+/) : null;
            if (!idMatch) return;
            const rePath = document.remote_path = '/' + idMatch[0];
            console.log("PATH:", rePath);
            webClient.get(rePath, function (getErr, req, res, data) {
                if (getErr || !data) {
                    if (getErr) {
                        console.error(getErr);
                    }
                    return;
                }
                const $ = cheerio.load(data);
                const PageCount = $(".main .pagenavi a span").eq(-2).html();
                document.detail_count = PageCount;
                const lastPage = parseInt(PageCount, 10);
                // Build the image-page links.
                console.log(`获取套图:${document.category_id},${PageCount}张`);
                for (let i = 2; i <= lastPage; i++) {
                    const img = {
                        category_id: document.category_id,
                        img_url: document.url + "/" + i,
                        remote_id: document.remote_id,
                        remote_path: document.remote_path + "/" + i
                    };
                    // This may push many image pages at once.
                    redisConn.rpush(cacheKeys.img_download_url, JSON.stringify(img), function (pushErr) {
                        console.log(`加入套图:${img.remote_id}`, pushErr);
                    });
                }
            });
        });
    },

    /**
     * 4. Pop one image-page record, locate the full-size image on that page
     * and download it below `${SpiderIDLE.BASE_PATH}/images/`.
     *
     * @param {function(Object)} callBACk - Receives the image record
     *     (with url_img, path and location filled in) after the download
     *     finished.
     */
    downloadYY: function (callBACk) {
        redisConn.lpop(cacheKeys.img_download_url, function (err, reply) {
            if (err || !reply) {
                return;
            }
            const img = JSON.parse(reply);
            webClient.get(img.remote_path, function (getErr, req, res, data) {
                if (getErr) {
                    console.log(getErr);
                    return;
                }
                if (!data) return;
                const $ = cheerio.load(data);
                // Find the image and download it.
                const urlImg = $(".main .main-image img").attr("src");
                if (!urlImg) {
                    // No image on the page; parsing an undefined URL would throw.
                    return;
                }
                img.url_img = urlImg;
                img.path = `/${img.category_id}/`;

                const fileDetail = Tools.parseUri(urlImg);
                const savePath = `${SpiderIDLE.BASE_PATH}/images/${img.category_id}${fileDetail.filepath}`;
                img.location = `/images/${img.category_id}${fileDetail.filepath}/${fileDetail.filename}`;
                Tools.checkDir(savePath);
                download(urlImg, savePath).then(function () {
                    console.log("下载真实大图:", urlImg, ",存储:", img.location);
                    callBACk(img);
                }).catch(function (downloadErr) {
                    console.error(downloadErr);
                });
            });
        });

    },

    /** Placeholder; not implemented. */
    downloadThumbs: function () {

    },

    /** Remove every key from the currently selected Redis database. */
    clearredis: function () {
        redisConn.flushdb(function (err) {
            console.log("清空redis Cache成功", err);
        });
    },
};

//Spider.clearredis();
/**
 * Main crawl loop. Every 100 ms, once the page count is known
 * (SpiderIDLE.start), it drives one step of each pipeline stage:
 * listing pages (until exhausted), image downloads and gallery expansion.
 */
const SpiderTimer = setInterval(function () {
    if (SpiderIDLE.start !== true) {
        return false;
    }
    if (SpiderIDLE.img_page_success !== true) {
        Spider.getImgPage(function (document) {
            const urlImg = document.content;
            if (!urlImg) {
                // No thumbnail URL on this entry; nothing to download or submit.
                return;
            }
            // 1. Download the thumbnail to local storage.
            const fileDetail = Tools.parseUri(urlImg);
            const savePath = SpiderIDLE.BASE_PATH + '/banner' + fileDetail.filepath;
            Tools.checkDir(savePath);
            download(urlImg, savePath).then(function () {
                console.log("DownloadThumbsImg:", "SavePath:", savePath);
            }).catch(function (downloadErr) {
                console.error(downloadErr);
            });

            // 2. Submit the gallery record to the server (listing data only).
            document.content = "/banner" + fileDetail.filepath + "/" + fileDetail.filename;
            document.view_count = Math.trunc(document.view_count);
            serverApiClient.post(ServerApi.DocumentAdd, document, function (err, req, res, data) {
                if (err) {
                    console.error(err);
                    return;
                }
                console.log(`图片Document:${document.category_id},写入服务器成功`);
            });

        });
    }
    Spider.downloadYY(function (picture) {
        serverApiClient.post(ServerApi.PictureAdd, picture, function (err, req, res, data) {
            if (err) {
                console.error(err);
                return;
            }
            console.log(`真实Picture:${picture.remote_id},存储服务器成功`);
        });
    });
    Spider.getTaoTuImgs(function (document) {

    });
}, 100);


// Bootstrap: read the page count, then set the flag the crawl loop checks
// before doing any work.
Spider.getPagelist(function (count) {
    console.log(`一共有:${count}个页面需要采集`);
    SpiderIDLE.start = true;
});


// Shutdown hook. 'exit' listeners may only do synchronous work, so this just
// stops the crawl loop and closes the Redis connection.
// The previous Spider.clearredis() call was issued after the connection had
// been closed, so it could never flush anything and only logged a misleading
// "success" line; it is dropped here rather than reordered, because reordering
// would turn a no-op into a real database flush.
process.on("exit", function () {
    clearInterval(SpiderTimer);
    redisConn.end(true);
    console.log("exit");
});

package.json

{
  "name": "nodespider",
  "version": "1.0.0",
  "description": "",
  "main": "index.js",
  "scripts": {
    "test": "echo \"Error: no test specified\" && exit 1"
  },
  "author": "",
  "license": "ISC",
  "dependencies": {
    "cheerio": "^1.0.0-rc.1",
    "download": "^6.2.2",
    "log4js": "^1.1.1",
    "redis": "^2.7.1",
    "request": "^2.81.0",
    "restify-clients": "^1.5.0"
  }
}

大佬总结

以上是大佬教程为你收集整理的Nodejs爬取10G妹子套图cheerio全部内容,希望文章能够帮你解决Nodejs爬取10G妹子套图cheerio所遇到的程序开发问题。

如果觉得大佬教程网站内容还不错,欢迎将大佬教程推荐给程序员好友。

本图文内容来源于网友网络收集整理提供,作为学习参考使用,版权属于原作者。
如您有任何意见或建议可联系处理。小编QQ:384754419,请注明来意。