FunTester httpclient 爬虫实例——爬取三级中学名

FunTester · 2020年03月29日 · 993 次阅读

本人在使用 httpclient 的过程中,突然想起来可以爬取一些数据,比如全国的中学名。这并非一时兴起,之前也做过这方面的爬虫,不过是基于 selenium 做的 UI 脚本,效率非常低,而且很不稳定,所以这次采取了接口的形式,果然效率提升了几个档次。一共 6 万 + 条数据,用了 16 分钟左右,期间包括数据库的存储。现在分享代码供大家参考。关键信息已隐去,大家看一下思路就好了。

package practise;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.http.client.methods.HttpGet;
import net.sf.json.JSONObject;
import source.ApiLibrary;
import source.Concurrent;

public class Crawler extends ApiLibrary {

    /** Base host of the target site; intentionally blanked out for publication. */
    public static String host = "";
    /** Province name -> region id. NOTE: duplicate names overwrite earlier entries. */
    public static Map<String, Integer> countrys = new HashMap<>();
    /** City name -> region id for the province currently being processed. */
    public static Map<String, Integer> citys = new HashMap<>();
    /** County/district name -> region id for the city currently being processed. */
    public static Map<String, Integer> address = new HashMap<>();
    /** School name -> id for the county currently being processed. */
    public static Map<String, Integer> school = new HashMap<>();
    /** Accumulated output lines: province PART city PART county PART school. */
    public static List<String> total = new ArrayList<>();

    /**
     * Matches the quoted numeric id inside each {@code <a>} fragment, e.g. {@code "1234"}.
     * Compiled once instead of on every getCode() call.
     */
    private static final Pattern CODE_PATTERN = Pattern.compile("\"\\d+\"");

    /**
     * Walks the four-level hierarchy (province -> city -> county -> school),
     * collecting one line per school, then persists the result.
     */
    public static void main(String[] args) {
        Crawler crawler = new Crawler();
        crawler.getCountry1(); // level 1: provinces
        for (Map.Entry<String, Integer> province : countrys.entrySet()) {
            crawler.getCountry2(province.getValue()); // level 2: cities
            for (Map.Entry<String, Integer> city : citys.entrySet()) {
                crawler.getCountry3(city.getValue()); // level 3: counties/districts
                for (Map.Entry<String, Integer> county : address.entrySet()) {
                    crawler.getCountry4(county.getValue()); // level 4: schools
                    for (String schoolName : school.keySet()) {
                        total.add(province.getKey() + PART + city.getKey() + PART
                                + county.getKey() + PART + schoolName);
                    }
                }
            }
        }
        Concurrent.saveRequestTimes(total);
        testOver();
    }

    /**
     * Fetches the province list (level 1) into {@link #countrys}.
     */
    public void getCountry1() {
        // Cookie/User-Agent headers were needed at some point; currently the
        // endpoint answers without them.
        fetchInto(host + "/user/editinfo/getSchollCountryList", countrys);
    }

    /**
     * Fetches the city list (level 2) for one province into {@link #citys}.
     *
     * @param id province region id
     */
    public void getCountry2(int id) {
        fetchInto(host + "/user/editinfo/getSchollCityList?region_id=" + id, citys);
    }

    /**
     * Fetches the county/district list (level 3) for one city into {@link #address}.
     *
     * @param id city region id
     */
    public void getCountry3(int id) {
        fetchInto(host + "/user/editinfo/getSchollAddressList?region_id=" + id, address);
    }

    /**
     * Fetches the school list (level 4) for one county into {@link #school}.
     *
     * @param id county region id
     */
    public void getCountry4(int id) {
        fetchInto(host + "/user/editinfo/getSchoolNameList?region_id=" + id, school);
    }

    /**
     * Shared fetch-and-parse step used by all four levels: issues a GET, reads the
     * "content" field of the JSON response, splits it on "&lt;/a&gt;" and stores each
     * fragment as a (name, id) pair. The target map is cleared first so it only ever
     * holds the latest response (previously getCountry1 skipped the clear — harmless
     * since it runs once, but inconsistent).
     *
     * @param url    full request URL
     * @param target map receiving name -> id pairs
     */
    private void fetchInto(String url, Map<String, Integer> target) {
        HttpGet httpGet = getHttpGet(url);
        JSONObject response = getHttpResponseEntityByJson(httpGet);
        target.clear();
        for (String fragment : response.getString("content").split("</a>")) {
            target.put(getName(fragment), getCode(fragment));
        }
    }

    /**
     * Extracts the numeric id from an {@code <a>} fragment.
     *
     * @param text one anchor-tag fragment
     * @return the id, or 0 when no quoted number is present
     */
    public int getCode(String text) {
        Matcher matcher = CODE_PATTERN.matcher(text);
        return matcher.find() ? changeStringToInt(matcher.group().replace("\"", "")) : 0;
    }

    /**
     * Extracts the display name from an {@code <a>} fragment: everything after the
     * last '&gt;'. When no '&gt;' exists, lastIndexOf returns -1 and the whole
     * string is returned — same as the original behavior.
     *
     * @param text one anchor-tag fragment
     * @return the name portion
     */
    public String getName(String text) {
        return text.substring(text.lastIndexOf('>') + 1);
    }

}

下面是爬取到的数据截图

技术类文章精选

非技术文章精选

大咖风采

如果觉得我的文章对您有用,请随意打赏。您的支持将鼓励我继续创作!
暂无回复。
需要 登录 后方可回复, 如果你还没有账号请点击这里 注册