Job hunting, job hunting
With the end of the year approaching, many of you are probably weighing up your next move. Perhaps, like me, you have scrolled endlessly through job listings trying to find a company close to home, only to find that many listings omit the address from the summary, so you have to click into each one to see the company's detailed address.
So I wrote a crawler to make the job hunt easier.
Project overview
The code consists mainly of two files: job_spider.py (the crawler) and jobs_data_analyse.py (the job-data analysis).
The spider first fetches the job listings for their summaries, then fetches each listing's detail page. Once the download finishes, the analysis runs.
The full code follows.
job_spider.py
from bs4 import BeautifulSoup
import requests
import os
from enum import Enum
from program import config
import pandas as pd

pd.set_option('expand_frame_repr', False)  # don't wrap wide DataFrames


class WEBTYPE(Enum):
    _51job = '_51job'    # 51job
    zhilian = 'zhilian'  # Zhaopin (zhilian)
    all = 3              # every supported site


# global counter of how many detail pages have been crawled
SPIDER_REQUIRE_COUNT = 0


# fetch the mapping between 51job area codes and area names
def get_51job_area_code():
    dic = {}
    for i in range(1, 37):
        url = 'http://search.51job.com/list/{}0000,000000,0000,00,9,99,ios,2,1.html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=1&dibiaoid=0&address=&line=&specialarea=00&from=&welfare='.format('%02d' % i)
        r = requests.get(url, headers=config.http_headers).content.decode('gbk')
        area_name = BeautifulSoup(r, 'lxml').find(id="work_position_input")['value']
        print(area_name, i)
        dic[area_name] = i
    file_path = os.path.join(config.job_data_dir, '51job_area_code.txt')
    print('51job地区编号文件获取成功')
    with open(file_path, "w+", encoding="utf-8") as f:
        f.write(str(dic))


# check whether the 51job area-code file exists locally; fetch it if not
def check_area_name():
    file_path = os.path.join(config.job_data_dir, '51job_area_code.txt')
    if os.path.exists(file_path):
        with open(file_path, "r", encoding="utf-8") as f:
            result = f.read()
            dic = eval(result)
    else:
        print('51job缺少地区编号文件,获取中')
        get_51job_area_code()
        check_area_name()


def fetch_data(web_type=WEBTYPE.all, keywords=['iOS'], page_count=5, area='深圳'):
    if os.path.exists(config.jobs_data_path):
        os.remove(config.jobs_data_path)
        print('删除之前爬的数据')
    if web_type == WEBTYPE.all:
        for site in list(WEBTYPE)[:-1]:  # every concrete site, skipping `all`
            _fetch_data(site, keywords, page_count, area)
    else:
        _fetch_data(web_type, keywords, page_count, area)


def _fetch_data(web_type, keywords, page_count, area):
    df = fetch_job_introduce(web_type, keywords, page_count, area)
    df = fetch_job_detail(df)
    df.fillna(value='', inplace=True)
    if os.path.exists(config.jobs_data_path):
        df_existed = pd.read_csv(config.jobs_data_path, encoding='utf-8', index_col=0)
        df = pd.concat([df, df_existed], ignore_index=True)  # df.append was removed in pandas 2.0
    df.sort_values(by=['地区'], inplace=True)
    df.reset_index(drop=True, inplace=True)
    df.to_csv(config.jobs_data_path, mode='w', encoding='utf-8')
    # drop the requirements column so the overview file stays readable
    df_no_require = df.drop(['要求'], axis=1)
    df_no_require['薪酬'] = df_no_require['薪酬'].apply(_make_introduce_beautiful, min_length=12)
    df_no_require['地区'] = df_no_require['地区'].apply(_make_introduce_beautiful, min_length=12)
    df_no_require['详细地址'] = df_no_require['详细地址'].apply(_make_introduce_beautiful, min_length=30)
    df_no_require['链接'] = df_no_require['链接'].apply(_make_introduce_beautiful, min_length=60)
    df_no_require.to_csv(config.jobs_data_introduce_path, mode='w', encoding='utf-8')


# left-align and pad each cell so the overview columns line up
def _make_introduce_beautiful(txt, min_length):
    try:
        return txt.ljust(min_length)
    except Exception as e:
        print(e)
        return ''.ljust(min_length)


# fetch the job summaries from the search-result pages
def fetch_job_introduce(web_type, keywords, page_count, area):
    url = ""
    decode_type = ""
    area_need = ""  # each site encodes the area differently
    if web_type == WEBTYPE._51job:
        url = "http://search.51job.com/list/{}0000,000000" \
              ",0000,00,9,99,{},2,{}.html?&stype=1&postchannel=0000&workyear=99&" \
              "cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0" \
              "&confirmdate=9&fromType=1&dibiaoid=0&address=&line=&specialarea=00&from=&welfare="
        decode_type = 'gbk'
        # 51job wants the numeric area code saved by get_51job_area_code()
        file_path = os.path.join(config.job_data_dir, '51job_area_code.txt')
        with open(file_path, mode='r', encoding='utf-8') as f:
            result = f.read()
            dic = eval(result)
            area_need = '%02d' % dic[area]
    elif web_type == WEBTYPE.zhilian:
        url = "http://sou.zhaopin.com/jobs/searchresult.ashx?jl={}&kw={}&isadv=0&sg=7e9e61449fd14593a5604fff81aec46a&p={}"
        decode_type = "utf-8"
        area_need = area  # Zhaopin takes the city name directly
    # the real page numbers start at 1, hence p + 1
    urls = [url.format(area_need, ' '.join(keywords), p + 1) for p in range(0, page_count)]
    df = fetch_companies(urls, decode_type, web_type)
    return df


def fetch_companies(urls, decode_type, web_type):
    df = pd.DataFrame(columns=['薪酬', '地区', '详细地址', '链接', '工作', '公司', '来源', '要求'])
    for url in urls:
        r = requests.get(url, headers=config.http_headers).content.decode(decode_type)
        if web_type == WEBTYPE._51job:
            bs = BeautifulSoup(r, 'lxml').find("div", class_="dw_table").find_all("div", class_="el")
            for b in bs:
                try:
                    href, job_name = b.find('a')['href'], b.find('a')['title']
                    company_name = b.find('span', class_='t2').text
                    locate = b.find('span', class_='t3').text
                    salary = b.find('span', class_='t4').text
                    dic = {'工作': job_name, '地区': locate, '详细地址': '', '薪酬': salary,
                           '公司': company_name, '链接': href, '来源': web_type.value, '要求': ''}
                    df.loc[df.shape[0]] = dic
                except Exception as e:
                    print(e, "简介解析错误")
        elif web_type == WEBTYPE.zhilian:
            bs = BeautifulSoup(r, 'lxml').find(id="newlist_list_content_table").find_all("table", class_="newlist")
            for b in bs:
                try:
                    # the first table is a header row without job data; parsing it
                    # fails and is swallowed by the except below
                    href = b.find("td", class_="zwmc").find("div").find("a")["href"]
                    job_name = b.find("td", class_="zwmc").find("div").find("a").text
                    company_name = b.find("td", class_="gsmc").find("a").text
                    locate = b.find("td", class_="gzdd").text
                    salary = b.find("td", class_="zwyx").text
                    dic = {'工作': job_name, '地区': locate, '详细地址': '', '薪酬': salary,
                           '公司': company_name, '链接': href, '来源': web_type.value, '要求': ''}
                    df.loc[df.shape[0]] = dic
                except Exception as e:
                    print(e, "简介解析错误")
    return df


# fill in the detail fields for every summary row
def fetch_job_detail(df):
    for i in range(0, df.shape[0]):
        introduce = df.loc[i]
        location, require = _fetch_location_and_require_from_detail(introduce)
        df.loc[i, '详细地址'] = location
        df.loc[i, '要求'] = require
    return df


# fetch the detailed address and the job requirements from a detail page
def _fetch_location_and_require_from_detail(introduce):
    global SPIDER_REQUIRE_COUNT
    web_type = introduce['来源']
    href = introduce['链接']
    company_name = introduce['公司']
    SPIDER_REQUIRE_COUNT += 1
    print("正在爬第{}个公司{}的要求\n{}".format(SPIDER_REQUIRE_COUNT, company_name, href))
    if web_type == WEBTYPE._51job.value:
        try:
            r = requests.get(href, headers=config.http_headers).content.decode("gbk")
            location_detail = _fetch_location_from_detail(r, introduce)
            bs = BeautifulSoup(r, 'lxml').find('div', class_="bmsg job_msg inbox")
            # strip the address footer and the share widget out of the text
            useless_bs1 = bs.find('p', class_='fp')
            useless_bs2 = bs.find('div', class_='share')
            require = bs.text.replace(useless_bs1.text, '').replace(useless_bs2.text, '') \
                .replace("\t", "").replace("\n", "").replace("\r", "")
            return location_detail, require
        except Exception as e:
            print(e, "工作要求解析错误")
            return "", ""
    elif web_type == WEBTYPE.zhilian.value:
        try:
            r = requests.get(href, headers=config.http_headers).content.decode("utf-8")
            location_detail = _fetch_location_from_detail(r, introduce)
            bs = BeautifulSoup(r, 'lxml').find('div', class_="tab-inner-cont")
            # strip the headline, subtitle and apply button out of the text
            useless_bs1 = bs.find('b')
            useless_bs2 = bs.find('h2')
            useless_bs3 = bs.find(id='applyVacButton1')
            require = bs.text.replace(useless_bs1.text, '').replace(useless_bs2.text, '').replace(useless_bs3.text, '') \
                .replace("\t", "").replace("\n", "").replace("\r", "")
            return location_detail, require
        except Exception as e:
            print(e, "工作要求解析错误")
            return "", ""


def _fetch_location_from_detail(h5_content, introduce):
    """Extract the company's detailed work address from a detail page."""
    web_type = introduce['来源']
    if web_type == WEBTYPE._51job.value:
        bs = BeautifulSoup(h5_content, 'lxml').find_all('p', class_="fp")
        for b in bs:
            try:
                location = b.text
                if "上班地址" in location:
                    return location.replace("上班地址:", "").replace("\t", "").replace("\n", "")
            except Exception as e:
                print(e, '上班地址解析错误')
                return introduce['地区']
        return introduce['地区']  # fall back to the district from the summary
    elif web_type == WEBTYPE.zhilian.value:
        bs = BeautifulSoup(h5_content, 'lxml').find('div', class_="tab-inner-cont")
        try:
            location = bs.find("h2").text
            return location.replace("\t", "").replace("\n", "").replace("\r", "").replace(" ", "").replace("查看职位地图", "")
        except Exception as e:
            print(e, '上班地址解析错误')
            return introduce['地区']
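The spider can also be driven on its own, without the analysis step. Below is a minimal sketch, assuming the same program.config paths as above, that crawls only 51job for two pages of iOS jobs in Shenzhen and prints a few columns of the result:

import pandas as pd
from program import config
from program.job_spider import WEBTYPE, check_area_name, fetch_data

check_area_name()                    # make sure the 51job area-code file exists
fetch_data(web_type=WEBTYPE._51job,  # crawl a single site instead of all of them
           keywords=['iOS'],
           page_count=2,             # two result pages is enough for a quick test
           area='深圳')

# the crawl writes its results to config.jobs_data_path
df = pd.read_csv(config.jobs_data_path, encoding='utf-8', index_col=0)
print(df[['公司', '薪酬', '详细地址']].head())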
jobs_data_analyse.py
import os
from program import config
import pandas as pd
import math
import jieba
import jieba.posseg
import csv
import matplotlib.pyplot as plt
from program.job_spider import *
import numpy as np
from PIL import Image
from collections import Counter
from wordcloud import WordCloud

pd.set_option('expand_frame_repr', False)


def jobs_data_analyse():
    df = pd.read_csv(config.jobs_data_path, encoding='utf-8')
    df['薪酬'] = df['薪酬'].apply(unify_salary_form)
    salary_analyse(df)
    require_analyse(df)


# normalise the salary strings into (min, max, "min-max") in yuan per month
def unify_salary_form(salary):
    if type(salary) == float and math.isnan(salary):
        return None
    month = 1
    if salary.endswith('/年'):
        month = 12  # yearly pay is spread over twelve months
        salary = salary.replace('/年', '')
    elif salary.endswith('/月'):
        month = 1
        salary = salary.replace('/月', '')
    multiple = 1
    if salary.endswith('千'):
        multiple = 1000
        salary = salary.replace('千', '')
    elif salary.endswith('万'):
        multiple = 10000
        salary = salary.replace('万', '')
    try:
        low = int(float(salary.split('-')[0]) * multiple / month)
        high = int(float(salary.split('-')[1]) * multiple / month)
        return str(low), str(high), str(low) + '-' + str(high)
    except Exception as e:
        print(e)
        return None


# salary statistics: industry average, extremes and per-district averages
def salary_analyse(df):
    df['low_薪酬'] = df['薪酬'].apply(lambda x: None if x is None else int(x[0]))
    df['high_薪酬'] = df['薪酬'].apply(lambda x: None if x is None else int(x[1]))
    print('该行业平均工资为: ', df.dropna(subset=['薪酬'])[['low_薪酬', 'high_薪酬']].mean().mean())
    index_max_salary = df['high_薪酬'].idxmax()
    index_min_salary = df['low_薪酬'].idxmin()
    print('最高薪酬的公司: %s 薪酬为: %d 链接如下\n%s'
          % (df.loc[index_max_salary, '公司'], df['high_薪酬'].max(), df.loc[index_max_salary, '链接']))
    print('最低薪酬的公司: %s 薪酬为: %d 链接如下\n%s'
          % (df.loc[index_min_salary, '公司'], df['low_薪酬'].min(), df.loc[index_min_salary, '链接']))
    for area, group in df.dropna(subset=['薪酬']).groupby('地区'):
        average_salary = group[['low_薪酬', 'high_薪酬']].mean().mean()
        print('该行业在地区:(%s)的平均薪酬为:%d' % (area, average_salary))


# requirement analysis: word frequencies plus a word cloud
def require_analyse(df):
    all_require = ''
    for require in df['要求']:
        if type(require) == float and math.isnan(require):
            continue
        all_require += require
    _require_word_freq(all_require)
    _require_word_cloud()


def _require_word_freq(all_require):
    # load the user dictionary so domain terms are segmented correctly
    jieba.load_userdict(os.path.join(config.jieba_dir, "user_dict.txt"))
    seg_lst = jieba.posseg.cut(all_require)
    counter = Counter()
    # load the stop words
    stopwords_path = os.path.join(config.jieba_dir, "stopwords.txt")
    with open(stopwords_path, "r", encoding="utf-8") as f:
        stopwords = [line.strip() for line in f.readlines()]
    for seg in seg_lst:
        if seg.word in stopwords:
            continue
        elif seg.flag == 'x':  # skip punctuation
            continue
        counter[seg.word] += 1
    counter_sorted = sorted(counter.items(), key=lambda value: value[1], reverse=True)
    # newline='' keeps csv.writer from inserting blank rows on Windows
    with open(config.jobs_require_word_freq_path, "w+", encoding="utf-8", newline='') as f:
        f_csv = csv.writer(f)
        f_csv.writerows(counter_sorted)
    print('词频文件保存成功,地址为:', config.jobs_require_word_freq_path)


def _require_word_cloud():
    word_freq_dic = dict()
    with open(config.jobs_require_word_freq_path, mode='r', encoding='utf-8') as f:
        f_csv = csv.reader(f)
        for row in f_csv:
            word_freq_dic[row[0]] = int(row[1])
    # To draw the cloud inside an image mask instead (the Alice image comes from
    # http://blog.csdn.net/fontthrone/article/details/72775865):
    # alice_coloring = np.array(Image.open(config.alice_png))
    # wc = WordCloud(font_path=config.wc_font_path, background_color='white', mask=alice_coloring,
    #                max_words=150, max_font_size=100, min_font_size=20)\
    #     .generate_from_frequencies(word_freq_dic)
    wc = WordCloud(font_path=config.wc_font_path, max_words=150, height=800,
                   width=1400).generate_from_frequencies(word_freq_dic)
    plt.imshow(wc, interpolation="bilinear")
    plt.axis('off')
    plt.show()
    wc.to_file(config.wordcloud_png_path)


def start():
    check_area_name()
    fetch_data(web_type=WEBTYPE.all, keywords=['iOS'], area='深圳', page_count=5)
    jobs_data_analyse()


start()
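As a quick sanity check on unify_salary_form: both sites quote salaries in formats such as 1-1.5万/月 or 10-20万/年 (the samples below are made up), and the function normalises them all to yuan per month. Note that jobs_data_analyse.py calls start() at the bottom, so snippets like this belong inside that file rather than in a separate script that imports it:

print(unify_salary_form('1-1.5万/月'))  # ('10000', '15000', '10000-15000'): 万 scales by 10000
print(unify_salary_form('10-20万/年'))  # ('8333', '16666', '8333-16666'): yearly pay divided by 12
print(unify_salary_form('6-8千/月'))    # ('6000', '8000', '6000-8000'): 千 scales by 1000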
How to use
Open jobs_data_analyse.py in the project and run it; adjust the parameters to your own needs, as in the sketch below.
Once it is running, it starts collecting data.
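For example, to crawl Android jobs in Guangzhou across ten pages per site (both values here are just illustrative), change the start() function at the bottom of jobs_data_analyse.py to:

def start():
    check_area_name()
    # e.g. Android jobs in Guangzhou, ten result pages per site
    fetch_data(web_type=WEBTYPE.all, keywords=['Android'], area='广州', page_count=10)
    jobs_data_analyse()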
[Screenshot: the crawler collecting data]
When collection finishes, it runs a brief analysis of the salary data it gathered.
[Screenshot: salary analysis output]
Finally, it generates a wordcloud from the job requirements.
[Word cloud: iOS requirement term frequencies in Shenzhen]
To make browsing jobs by district easier, I save the job summaries to jobs_data_introduce.csv; search it for the district you care about, or filter it as shown below.
[Screenshot: the job-summary CSV]
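If you'd rather filter than search the CSV by hand, a few lines of pandas will do; 南山 below is just an example district:

import pandas as pd
from program import config

df = pd.read_csv(config.jobs_data_introduce_path, encoding='utf-8', index_col=0)
# the 地区 column is padded with spaces, so a substring match is the safest filter
nearby = df[df['地区'].str.contains('南山', na=False)]
print(nearby[['公司', '薪酬', '详细地址', '链接']])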
This demo was written to fit my own needs, so treat it purely as a reference.
Author: luomagaoshou
Link: https://www.jianshu.com/p/c23d7761d79b