Job hunting, job hunting
With the end of the year approaching, many of you are probably weighing up your next move. Perhaps, like me, you have scrolled endlessly through job listings trying to find a company close to home, only to find that many listings omit the address from the summary, so you have to click into each one to see the company's detailed address.
So I wrote a crawler to make the job hunt easier.
Project overview
The code consists mainly of two files: job_spider.py (the crawler) and jobs_data_analyse.py (the job-data analysis).
The spider first fetches the job listings for their summaries, then fetches each listing's detail page. Once the download finishes, the analysis runs.
The full code follows.
job_spider.py
from bs4 import BeautifulSoup
import requests
import os
from enum import Enum
from program import config
import pandas as pd

pd.set_option('expand_frame_repr', False)  # don't wrap wide DataFrames


class WEBTYPE(Enum):
    _51job = '_51job'    # 51job
    zhilian = 'zhilian'  # Zhaopin (zhilian)
    all = 3              # every supported site


# global counter of how many detail pages have been crawled
SPIDER_REQUIRE_COUNT = 0


# fetch the mapping between 51job area codes and area names
def get_51job_area_code():
    dic = {}
    for i in range(1, 37):
        url = 'http://search.51job.com/list/{}0000,000000,0000,00,9,99,ios,2,1.html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=1&dibiaoid=0&address=&line=&specialarea=00&from=&welfare='.format('%02d' % i)
        r = requests.get(url, headers=config.http_headers).content.decode('gbk')
        area_name = BeautifulSoup(r, 'lxml').find(id="work_position_input")['value']
        print(area_name, i)
        dic[area_name] = i
    file_path = os.path.join(config.job_data_dir, '51job_area_code.txt')
    print('51job地区编号文件获取成功')
    with open(file_path, "w+", encoding="utf-8") as f:
        f.write(str(dic))


# check whether the 51job area-code file exists locally; fetch it if not
def check_area_name():
    file_path = os.path.join(config.job_data_dir, '51job_area_code.txt')
    if os.path.exists(file_path):
        with open(file_path, "r", encoding="utf-8") as f:
            result = f.read()
            dic = eval(result)
    else:
        print('51job缺少地区编号文件,获取中')
        get_51job_area_code()
        check_area_name()


def fetch_data(web_type=WEBTYPE.all, keywords=['iOS'], page_count=5, area='深圳'):
    if os.path.exists(config.jobs_data_path):
        os.remove(config.jobs_data_path)
        print('删除之前爬的数据')
    if web_type == WEBTYPE.all:
        for site in list(WEBTYPE)[:-1]:  # every concrete site, skipping `all`
            _fetch_data(site, keywords, page_count, area)
    else:
        _fetch_data(web_type, keywords, page_count, area)


def _fetch_data(web_type, keywords, page_count, area):
    df = fetch_job_introduce(web_type, keywords, page_count, area)
    df = fetch_job_detail(df)
    df.fillna(value='', inplace=True)
    if os.path.exists(config.jobs_data_path):
        df_existed = pd.read_csv(config.jobs_data_path, encoding='utf-8', index_col=0)
        df = pd.concat([df, df_existed], ignore_index=True)  # df.append was removed in pandas 2.0
    df.sort_values(by=['地区'], inplace=True)
    df.reset_index(drop=True, inplace=True)
    df.to_csv(config.jobs_data_path, mode='w', encoding='utf-8')
    # drop the requirements column so the overview file stays readable
    df_no_require = df.drop(['要求'], axis=1)
    df_no_require['薪酬'] = df_no_require['薪酬'].apply(_make_introduce_beautiful, min_length=12)
    df_no_require['地区'] = df_no_require['地区'].apply(_make_introduce_beautiful, min_length=12)
    df_no_require['详细地址'] = df_no_require['详细地址'].apply(_make_introduce_beautiful, min_length=30)
    df_no_require['链接'] = df_no_require['链接'].apply(_make_introduce_beautiful, min_length=60)
    df_no_require.to_csv(config.jobs_data_introduce_path, mode='w', encoding='utf-8')


# left-align and pad each cell so the overview columns line up
def _make_introduce_beautiful(txt, min_length):
    try:
        return txt.ljust(min_length)
    except Exception as e:
        print(e)
        return ''.ljust(min_length)


# fetch the job summaries from the search-result pages
def fetch_job_introduce(web_type, keywords, page_count, area):
    url = ""
    decode_type = ""
    area_need = ""  # each site encodes the area differently
    if web_type == WEBTYPE._51job:
        url = "http://search.51job.com/list/{}0000,000000" \
              ",0000,00,9,99,{},2,{}.html?&stype=1&postchannel=0000&workyear=99&" \
              "cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0" \
              "&confirmdate=9&fromType=1&dibiaoid=0&address=&line=&specialarea=00&from=&welfare="
        decode_type = 'gbk'
        # 51job wants the numeric area code saved by get_51job_area_code()
        file_path = os.path.join(config.job_data_dir, '51job_area_code.txt')
        with open(file_path, mode='r', encoding='utf-8') as f:
            result = f.read()
            dic = eval(result)
            area_need = '%02d' % dic[area]
    elif web_type == WEBTYPE.zhilian:
        url = "http://sou.zhaopin.com/jobs/searchresult.ashx?jl={}&kw={}&isadv=0&sg=7e9e61449fd14593a5604fff81aec46a&p={}"
        decode_type = "utf-8"
        area_need = area  # Zhaopin takes the city name directly
    # the real page numbers start at 1, hence p + 1
    urls = [url.format(area_need, ' '.join(keywords), p + 1) for p in range(0, page_count)]
    df = fetch_companies(urls, decode_type, web_type)
    return df


def fetch_companies(urls, decode_type, web_type):
    df = pd.DataFrame(columns=['薪酬', '地区', '详细地址', '链接', '工作', '公司', '来源', '要求'])
    for url in urls:
        r = requests.get(url, headers=config.http_headers).content.decode(decode_type)
        if web_type == WEBTYPE._51job:
            bs = BeautifulSoup(r, 'lxml').find("div", class_="dw_table").find_all("div", class_="el")
            for b in bs:
                try:
                    href, job_name = b.find('a')['href'], b.find('a')['title']
                    company_name = b.find('span', class_='t2').text
                    locate = b.find('span', class_='t3').text
                    salary = b.find('span', class_='t4').text
                    dic = {'工作': job_name, '地区': locate, '详细地址': '', '薪酬': salary,
                           '公司': company_name, '链接': href, '来源': web_type.value, '要求': ''}
                    df.loc[df.shape[0]] = dic
                except Exception as e:
                    print(e, "简介解析错误")
        elif web_type == WEBTYPE.zhilian:
            bs = BeautifulSoup(r, 'lxml').find(id="newlist_list_content_table").find_all("table", class_="newlist")
            for b in bs:
                try:
                    # the first table is a header row without job data; parsing it
                    # fails and is swallowed by the except below
                    href = b.find("td", class_="zwmc").find("div").find("a")["href"]
                    job_name = b.find("td", class_="zwmc").find("div").find("a").text
                    company_name = b.find("td", class_="gsmc").find("a").text
                    locate = b.find("td", class_="gzdd").text
                    salary = b.find("td", class_="zwyx").text
                    dic = {'工作': job_name, '地区': locate, '详细地址': '', '薪酬': salary,
                           '公司': company_name, '链接': href, '来源': web_type.value, '要求': ''}
                    df.loc[df.shape[0]] = dic
                except Exception as e:
                    print(e, "简介解析错误")
    return df


# fill in the detail fields for every summary row
def fetch_job_detail(df):
    for i in range(0, df.shape[0]):
        introduce = df.loc[i]
        location, require = _fetch_location_and_require_from_detail(introduce)
        df.loc[i, '详细地址'] = location
        df.loc[i, '要求'] = require
    return df


# fetch the detailed address and the job requirements from a detail page
def _fetch_location_and_require_from_detail(introduce):
    global SPIDER_REQUIRE_COUNT
    web_type = introduce['来源']
    href = introduce['链接']
    company_name = introduce['公司']
    SPIDER_REQUIRE_COUNT += 1
    print("正在爬第{}个公司{}的要求\n{}".format(SPIDER_REQUIRE_COUNT, company_name, href))
    if web_type == WEBTYPE._51job.value:
        try:
            r = requests.get(href, headers=config.http_headers).content.decode("gbk")
            location_detail = _fetch_location_from_detail(r, introduce)
            bs = BeautifulSoup(r, 'lxml').find('div', class_="bmsg job_msg inbox")
            # strip the address footer and the share widget out of the text
            useless_bs1 = bs.find('p', class_='fp')
            useless_bs2 = bs.find('div', class_='share')
            require = bs.text.replace(useless_bs1.text, '').replace(useless_bs2.text, '') \
                .replace("\t", "").replace("\n", "").replace("\r", "")
            return location_detail, require
        except Exception as e:
            print(e, "工作要求解析错误")
            return "", ""
    elif web_type == WEBTYPE.zhilian.value:
        try:
            r = requests.get(href, headers=config.http_headers).content.decode("utf-8")
            location_detail = _fetch_location_from_detail(r, introduce)
            bs = BeautifulSoup(r, 'lxml').find('div', class_="tab-inner-cont")
            # strip the headline, subtitle and apply button out of the text
            useless_bs1 = bs.find('b')
            useless_bs2 = bs.find('h2')
            useless_bs3 = bs.find(id='applyVacButton1')
            require = bs.text.replace(useless_bs1.text, '').replace(useless_bs2.text, '').replace(useless_bs3.text, '') \
                .replace("\t", "").replace("\n", "").replace("\r", "")
            return location_detail, require
        except Exception as e:
            print(e, "工作要求解析错误")
            return "", ""


def _fetch_location_from_detail(h5_content, introduce):
    """Extract the company's detailed work address from a detail page."""
    web_type = introduce['来源']
    if web_type == WEBTYPE._51job.value:
        bs = BeautifulSoup(h5_content, 'lxml').find_all('p', class_="fp")
        for b in bs:
            try:
                location = b.text
                if "上班地址" in location:
                    return location.replace("上班地址:", "").replace("\t", "").replace("\n", "")
            except Exception as e:
                print(e, '上班地址解析错误')
                return introduce['地区']
        return introduce['地区']  # fall back to the district from the summary
    elif web_type == WEBTYPE.zhilian.value:
        bs = BeautifulSoup(h5_content, 'lxml').find('div', class_="tab-inner-cont")
        try:
            location = bs.find("h2").text
            return location.replace("\t", "").replace("\n", "").replace("\r", "").replace(" ", "").replace("查看职位地图", "")
        except Exception as e:
            print(e, '上班地址解析错误')
            return introduce['地区']
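The spider can also be driven on its own, without the analysis step. Below is a minimal sketch, assuming the same program.config paths as above, that crawls only 51job for two pages of iOS jobs in Shenzhen and prints a few columns of the result:

import pandas as pd
from program import config
from program.job_spider import WEBTYPE, check_area_name, fetch_data

check_area_name()                    # make sure the 51job area-code file exists
fetch_data(web_type=WEBTYPE._51job,  # crawl a single site instead of all of them
           keywords=['iOS'],
           page_count=2,             # two result pages is enough for a quick test
           area='深圳')

# the crawl writes its results to config.jobs_data_path
df = pd.read_csv(config.jobs_data_path, encoding='utf-8', index_col=0)
print(df[['公司', '薪酬', '详细地址']].head())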
jobs_data_analyse.py
import os
from program import config
import pandas as pd
import math
import jieba
import jieba.posseg
import csv
import matplotlib.pyplot as plt
from program.job_spider import *
import numpy as np
from PIL import Image
from collections import Counter
from wordcloud import WordCloud

pd.set_option('expand_frame_repr', False)


def jobs_data_analyse():
    df = pd.read_csv(config.jobs_data_path, encoding='utf-8')
    df['薪酬'] = df['薪酬'].apply(unify_salary_form)
    salary_analyse(df)
    require_analyse(df)


# normalise the salary strings into (min, max, "min-max") in yuan per month
def unify_salary_form(salary):
    if type(salary) == float and math.isnan(salary):
        return None
    month = 1
    if salary.endswith('/年'):
        month = 12  # yearly pay is spread over twelve months
        salary = salary.replace('/年', '')
    elif salary.endswith('/月'):
        month = 1
        salary = salary.replace('/月', '')
    multiple = 1
    if salary.endswith('千'):
        multiple = 1000
        salary = salary.replace('千', '')
    elif salary.endswith('万'):
        multiple = 10000
        salary = salary.replace('万', '')
    try:
        low = int(float(salary.split('-')[0]) * multiple / month)
        high = int(float(salary.split('-')[1]) * multiple / month)
        return str(low), str(high), str(low) + '-' + str(high)
    except Exception as e:
        print(e)
        return None


# salary statistics: industry average, extremes and per-district averages
def salary_analyse(df):
    df['low_薪酬'] = df['薪酬'].apply(lambda x: None if x is None else int(x[0]))
    df['high_薪酬'] = df['薪酬'].apply(lambda x: None if x is None else int(x[1]))
    print('该行业平均工资为: ', df.dropna(subset=['薪酬'])[['low_薪酬', 'high_薪酬']].mean().mean())
    index_max_salary = df['high_薪酬'].idxmax()
    index_min_salary = df['low_薪酬'].idxmin()
    print('最高薪酬的公司: %s 薪酬为: %d 链接如下\n%s'
          % (df.loc[index_max_salary, '公司'], df['high_薪酬'].max(), df.loc[index_max_salary, '链接']))
    print('最低薪酬的公司: %s 薪酬为: %d 链接如下\n%s'
          % (df.loc[index_min_salary, '公司'], df['low_薪酬'].min(), df.loc[index_min_salary, '链接']))
    for area, group in df.dropna(subset=['薪酬']).groupby('地区'):
        average_salary = group[['low_薪酬', 'high_薪酬']].mean().mean()
        print('该行业在地区:(%s)的平均薪酬为:%d' % (area, average_salary))


# requirement analysis: word frequencies plus a word cloud
def require_analyse(df):
    all_require = ''
    for require in df['要求']:
        if type(require) == float and math.isnan(require):
            continue
        all_require += require
    _require_word_freq(all_require)
    _require_word_cloud()


def _require_word_freq(all_require):
    # load the user dictionary so domain terms are segmented correctly
    jieba.load_userdict(os.path.join(config.jieba_dir, "user_dict.txt"))
    seg_lst = jieba.posseg.cut(all_require)
    counter = Counter()
    # load the stop words
    stopwords_path = os.path.join(config.jieba_dir, "stopwords.txt")
    with open(stopwords_path, "r", encoding="utf-8") as f:
        stopwords = [line.strip() for line in f.readlines()]
    for seg in seg_lst:
        if seg.word in stopwords:
            continue
        elif seg.flag == 'x':  # skip punctuation
            continue
        counter[seg.word] += 1
    counter_sorted = sorted(counter.items(), key=lambda value: value[1], reverse=True)
    # newline='' keeps csv.writer from inserting blank rows on Windows
    with open(config.jobs_require_word_freq_path, "w+", encoding="utf-8", newline='') as f:
        f_csv = csv.writer(f)
        f_csv.writerows(counter_sorted)
    print('词频文件保存成功,地址为:', config.jobs_require_word_freq_path)


def _require_word_cloud():
    word_freq_dic = dict()
    with open(config.jobs_require_word_freq_path, mode='r', encoding='utf-8') as f:
        f_csv = csv.reader(f)
        for row in f_csv:
            word_freq_dic[row[0]] = int(row[1])
    # To draw the cloud inside an image mask instead (the Alice image comes from
    # http://blog.csdn.net/fontthrone/article/details/72775865):
    # alice_coloring = np.array(Image.open(config.alice_png))
    # wc = WordCloud(font_path=config.wc_font_path, background_color='white', mask=alice_coloring,
    #                max_words=150, max_font_size=100, min_font_size=20)\
    #     .generate_from_frequencies(word_freq_dic)
    wc = WordCloud(font_path=config.wc_font_path, max_words=150, height=800,
                   width=1400).generate_from_frequencies(word_freq_dic)
    plt.imshow(wc, interpolation="bilinear")
    plt.axis('off')
    plt.show()
    wc.to_file(config.wordcloud_png_path)


def start():
    check_area_name()
    fetch_data(web_type=WEBTYPE.all, keywords=['iOS'], area='深圳', page_count=5)
    jobs_data_analyse()


start()
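As a quick sanity check on unify_salary_form: both sites quote salaries in formats such as 1-1.5万/月 or 10-20万/年 (the samples below are made up), and the function normalises them all to yuan per month. Note that jobs_data_analyse.py calls start() at the bottom, so snippets like this belong inside that file rather than in a separate script that imports it:

print(unify_salary_form('1-1.5万/月'))  # ('10000', '15000', '10000-15000'): 万 scales by 10000
print(unify_salary_form('10-20万/年'))  # ('8333', '16666', '8333-16666'): yearly pay divided by 12
print(unify_salary_form('6-8千/月'))    # ('6000', '8000', '6000-8000'): 千 scales by 1000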
How to use
Open jobs_data_analyse.py in the project and run it; adjust the parameters to your own needs, as in the sketch below.
Once it is running, it starts collecting data.
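For example, to crawl Android jobs in Guangzhou across ten pages per site (both values here are just illustrative), change the start() function at the bottom of jobs_data_analyse.py to:

def start():
    check_area_name()
    # e.g. Android jobs in Guangzhou, ten result pages per site
    fetch_data(web_type=WEBTYPE.all, keywords=['Android'], area='广州', page_count=10)
    jobs_data_analyse()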
[Screenshot: the crawler collecting data]
When collection finishes, it runs a brief analysis of the salary data it gathered.
[Screenshot: salary analysis output]
Finally, it generates a wordcloud from the job requirements.
[Word cloud: iOS requirement term frequencies in Shenzhen]
To make browsing jobs by district easier, I save the job summaries to jobs_data_introduce.csv; search it for the district you care about, or filter it as shown below.
[Screenshot: the job-summary CSV]
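If you'd rather filter than search the CSV by hand, a few lines of pandas will do; 南山 below is just an example district:

import pandas as pd
from program import config

df = pd.read_csv(config.jobs_data_introduce_path, encoding='utf-8', index_col=0)
# the 地区 column is padded with spaces, so a substring match is the safest filter
nearby = df[df['地区'].str.contains('南山', na=False)]
print(nearby[['公司', '薪酬', '详细地址', '链接']])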
This demo was written to fit my own needs, so treat it purely as a reference.
Author: luomagaoshou
Link: https://www.jianshu.com/p/c23d7761d79b