完整代码如下:
"""Scrape the Shenwan level-3 industry categories and their stocks from Sina.

Workflow (all driven by ``main``):

1. Download the Sina market-centre node list and cut out the level-3
   ("sw3") category names and codes.
2. For every category, download its stock list and pick out symbol, code,
   name, latest price, market caps and turnover ratio.
3. Write three CSV files (``shw3category_details.csv``,
   ``shenwan3_category.csv``, ``shenwan3_category_stocks.csv``).
4. Optionally insert the per-stock rows into the PostgreSQL table
   ``stocks_hsl``.

Field names seen in one stock record, as noted by the original author:

    "symbol":"sz002281", "code":"002281",
    "name":"\\u5149\\u8fc5\\u79d1\\u6280", "trade":"22.740",
    "pricechange":-0.29, "changepercent":-1.259, "buy":"22.740",
    "sell":"22.750", "settlement":"23.030" (previous close),
    "open":"23.050", "high":"23.220", "low":"22.670",
    "volume":6874488 (traded volume), "amount":157353968 (traded amount),
    "ticktime":"15:00:03", "per":31.151, "pb":2.905 (price-to-book),
    "mktcap":1590455.879532 (total market cap),
    "nmc":1507701.365214 (free-float market cap),
    "turnoverratio":1.03685 (turnover ratio)

Example of one record after splitting on commas and keeping only the values:

    ['"sz000070"', '"000070"', '"特发信息"', '"5.860"', '0.04', '0.687',
     '"5.850"', '"5.860"', '"5.820"', '"5.810"', '"5.860"', '"5.660"',
     '8478528', '48999967', '2022-06-14', '-7.808', '2.588',
     '494924.355832', '487960.883258', '1.0182']
"""
import random
import re
import sys
import time
from io import StringIO, BytesIO
from operator import itemgetter

import pandas as pd
import psycopg2
import psycopg2.extras as extras
import requests
from bs4 import BeautifulSoup
from psycopg2 import Error
from psycopg2.extensions import ISOLATION_LEVEL_AUTOCOMMIT

NODES_URL = 'http://vip.stock.finance.sina.com.cn/quotes_service/api/json_v2.php/Market_Center.getHQNodes'
NODE_DATA_URL = 'http://vip.stock.finance.sina.com.cn/quotes_service/api/json_v2.php/Market_Center.getHQNodeData?page=1&num=500&sort=symbol&asc=1&node='
HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36"}
# Seconds to wait for the server before giving up instead of hanging forever.
REQUEST_TIMEOUT = 30

# Column order shared by the per-stock CSV file and the stocks_hsl table.
STOCK_COLUMNS = [
    'shw3_code', 'category_name', 'category_mktcode', 'stock_code',
    'stock_name', 'stock_mktcap', 'stock_nmc', 'stock_price', 'turnoverratio',
]


def remove_col(arr, ith):
    """Return a copy of the 2-D list *arr* without column *ith*.

    Rows shorter than *ith* are copied unchanged; an empty *arr* yields [].
    """
    return [[value for j, value in enumerate(row) if j != ith] for row in arr]


def _unquote(value):
    """Strip one pair of surrounding double quotes, if (and only if) present.

    Unquoted values (e.g. numeric fields) are returned untouched, so no
    leading/trailing digit is lost.
    """
    if len(value) >= 2 and value[0] == '"' and value[-1] == '"':
        return value[1:-1]
    return value


def _decode_escapes(text):
    """Turn literal ``\\uXXXX`` escape sequences in *text* into characters."""
    return text.encode('utf-8').decode('unicode_escape')


def fetch_text(url):
    """Download *url* and return the text BeautifulSoup extracts from it.

    The browser-like User-Agent header and a timeout are sent with every
    request.
    """
    response = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    return BeautifulSoup(response.content, features='lxml').text


def parse_categories(s):
    """Extract the level-3 categories from the node-list text *s*.

    Returns a list of ``[quoted_name, quoted_code]`` pairs; both strings still
    carry their surrounding double quotes.

    Raises:
        ValueError: if the 'sw2_hy' / 'sw3_hy' markers are not in *s*.
    """
    start = s.find('sw2_hy')
    end = s.find('sw3_hy')
    if start == -1 or end == -1:
        raise ValueError("markers 'sw2_hy'/'sw3_hy' not found in node list")

    # The level-3 entries sit between the two markers, inside "[[ ... ]]".
    shw3 = s[start:end]
    shw3_cut = shw3[shw3.find('[['):shw3.find(']]')]
    shw3_cut = shw3_cut.replace('[', '')

    rows = []
    for i, chunk in enumerate(shw3_cut.split(']')):
        fields = chunk.split(',')
        # Every chunk but the first starts with the ',' that separated the
        # entries, so its fields are shifted right by one empty element.
        offset = 0 if i == 0 else 1
        fields = fields[offset:offset + 3]
        if fields:
            fields[0] = _decode_escapes(fields[0])
        rows.append(fields)
    # Drop the middle column, keeping name and code.
    return remove_col(rows, 1)


def parse_stock_records(s3):
    """Parse the stock-list text *s3* of one category.

    Returns a list of dicts with the string values ``symbol``, ``code``,
    ``name``, ``price``, ``mktcap``, ``nmc`` and ``turnoverratio``. Fields are
    located by position (first four and last three of each record).
    """
    cleaned = re.sub(r'[\[\]{]', '', s3)
    chunks = cleaned.split('}')
    chunks.pop()  # split leaves a trailing fragment after the last '}'

    records = []
    for j, chunk in enumerate(chunks):
        rst = [field.split(':') for field in chunk.split(',')]
        # Records after the first begin with the separating ',' and therefore
        # carry one empty leading field.
        off = 0 if j == 0 else 1
        records.append({
            'symbol': _unquote(rst[off][1]),
            'code': _unquote(rst[off + 1][1]),
            'name': _decode_escapes(_unquote(rst[off + 2][1])),
            'price': _unquote(rst[off + 3][1]),        # latest price
            'mktcap': rst[-3][1],                      # total market cap
            'nmc': rst[-2][1],                         # free-float market cap
            'turnoverratio': _unquote(rst[-1][1]),     # turnover ratio
        })
    return records


def save_to_postgres(rows):
    """Insert *rows* (tuples in STOCK_COLUMNS order) into table stocks_hsl.

    Rows that hit a uniqueness conflict are skipped. Errors are reported on
    stdout rather than raised; cursor and connection are always closed.
    """
    conn = None
    cur = None
    try:
        conn = psycopg2.connect(
            host="localhost",
            database="stockdb",
            user="postgres",
            password=""
        )
        cur = conn.cursor()
        # Bulk insert that ignores unique-key conflicts.
        sql = "INSERT INTO stocks_hsl " + \
              "(shw3_code,category_name,category_mktcode,stock_code,stock_name,stock_mktcap,stock_nmc,stock_price,turnoverratio)" + \
              "VALUES (%s, %s,%s, %s,%s, %s,%s, %s,%s) ON CONFLICT DO NOTHING"
        cur.executemany(sql, rows)
        conn.commit()
    except (Exception, Error) as e:
        print(" 连接 PostgreSQL 时报错! ", e)
    finally:
        # Both may still be None when connect() itself failed.
        if cur:
            cur.close()
        if conn:
            conn.close()


def main():
    """Run the whole scrape / export / optional database-save workflow."""
    s = fetch_text(NODES_URL)
    print('\n申万三级分类:')
    result_shw3 = parse_categories(s)
    print()
    print('申万三级分类总数:', len(result_shw3))
    print(result_shw3[0:3])

    # Level-3 categories together with the stocks of each category.
    print('申万三级及其所属股票')
    shw3_category_and_stocks = []
    stocks_hsl = []
    for i, row in enumerate(result_shw3):
        quoted_name, quoted_code = row[0], row[1]
        node = quoted_code[1:11]  # code without the leading quote
        url3 = NODE_DATA_URL + node + '&symbol=&_s_r_a=init'
        print(url3, i, quoted_name, node)

        category_name = _unquote(quoted_name)
        category_code = _unquote(quoted_code)
        shw_three_stocks = []
        for stock in parse_stock_records(fetch_text(url3)):
            shw_three_stocks.append([
                stock['symbol'], stock['code'], stock['name'],
                stock['mktcap'], stock['nmc'], stock['price'],
                stock['turnoverratio'],
            ])
            stocks_hsl.append((
                category_code,
                category_name,
                stock['symbol'],
                stock['code'],
                stock['name'],
                round(float(stock['mktcap']), 2),
                round(float(stock['nmc']), 2),
                round(float(stock['price']), 2),
                # An empty turnover ratio is stored as '0'.
                stock['turnoverratio'] if stock['turnoverratio'] != '' else '0',
            ))
        shw3_category_and_stocks.append(
            [[category_name, category_code], 'A', shw_three_stocks])
        # Random pause so the site is not hit too densely and blocks us.
        time.sleep(random.randint(1, 6))

    print()
    # Raw dump, only meant for inspecting the scraped data while testing.
    with open('shw3category_details.csv', 'w', encoding='utf-8') as fobj:
        for entry in shw3_category_and_stocks:
            print(f'{entry},')
            fobj.write(str(entry) + '\n')

    print()
    print('申万三级分类总数:', len(result_shw3))
    print('申万三级分类总数(包括各分类的股票):', len(shw3_category_and_stocks))

    category_df = pd.DataFrame({
        'shw3_code': [entry[0][1] for entry in shw3_category_and_stocks],
        'shw3_category': [entry[0][0] for entry in shw3_category_and_stocks],
    })
    category_df.to_csv('shenwan3_category.csv', index=False)

    # Per-stock table; built from the stock rows, not the category frame.
    stock_df = pd.DataFrame(stocks_hsl, columns=STOCK_COLUMNS)
    stock_df.to_csv('shenwan3_category_stocks.csv', index=False)

    print('\n 确定是否保存个股换手率数据到数据库 PostgreSQL ')
    answer = input("\n 输入1 - 保存, 输入2 - 不保存, 默认为 2: ")
    try:
        choosing_method = int(answer)
    except ValueError:
        # Empty or non-numeric input falls back to the advertised default.
        choosing_method = 2

    if choosing_method == 1:
        print("选择 " + str(choosing_method) + ' , 保存数据到数据表 stocks_hsl ... ')
        save_to_postgres(stocks_hsl)
    else:
        print('本次不保存个股换手率数据到数据库')


if __name__ == "__main__":
    main()
申万Ⅲ级板块分类,共336类
shw3_code,shw3_categorysw3_240602,镍sw3_720706,其他数字媒体sw3_770302,医美做事sw3_770301,医美耗材sw3_770202,品牌扮装品sw3_770201,扮装品制造及其他sw3_770102,洗护用品sw3_770101,生活用纸sw3_760201,环保设备Ⅲsw3_760104,综合环境管理sw3_760103,固废管理sw3_760102,水务及水管理sw3_760101,大气管理sw3_750303,其他石化sw3_750302,油品石化贸易sw3_750301,炼油化工sw3_750202,油气及炼化工程sw3_750201,油田做事sw3_750101,油气开采Ⅲsw3_740201,焦炭Ⅲsw3_740102,焦煤sw3_740101,动力煤sw3_730207,其他通信设备sw3_730206,通信终端及配件sw3_730205,通信线缆及配套sw3_730204,通信网络设备及器件sw3_730104,通信运用增值做事sw3_730103,通信工程及做事sw3_730102,电信运营商sw3_721001,电视广播Ⅲsw3_720902,大众出版sw3_720901,教诲出版sw3_720705,笔墨媒体sw3_720704,门户网站sw3_720703,图片媒体sw3_720701,视频媒体sw3_720602,院线sw3_720601,影视动漫制作sw3_720502,广告媒体sw3_720501,营销代理sw3_720401,游戏Ⅲsw3_710402,横向通用软件sw3_710401,垂直运用软件sw3_710301,IT做事Ⅲsw3_710103,其他打算机设备sw3_710102,安防设备sw3_650501,军工电子Ⅲsw3_640704,其他自动扮装备sw3_640703,激光设备sw3_640702,工控设备sw3_640701,机器人sw3_640602,工程机器器件sw3_640601,工程机器整机sw3_640108,金属制品sw3_640107,仪器仪表sw3_630805,线缆部件及其他sw3_630804,电工仪器仪表sw3_630803,电网自动扮装备sw3_630802,配电设备sw3_630801,输变电设备sw3_630705,蓄电池及其他电池sw3_630704,燃料电池sw3_630703,锂电专用设备sw3_630702,电池化学品sw3_630701,锂电池sw3_630602,风电零部件sw3_630601,风电整机sw3_630505,光伏加工设备sw3_630504,光伏辅材sw3_630503,逆变器sw3_630502,光伏电池组件sw3_630501,硅料硅片sw3_620601,工程咨询做事Ⅲsw3_620307,园林工程sw3_620306,基建市政工程sw3_610305,涂料sw3_610304,防水材料sw3_610202,玻纤制造sw3_610102,水泥制品sw3_490308,其他多元金融sw3_490307,资产管理sw3_490306,金融信息做事sw3_490305,租赁sw3_490304,相信sw3_490303,期货sw3_490302,金融控股sw3_480501,农商行Ⅲsw3_480401,城商行Ⅲsw3_480301,股份制银行Ⅲsw3_480201,国有大型银行Ⅲsw3_461103,教诲运营及其他sw3_461102,培训教诲sw3_461101,学历教诲sw3_461004,旅游综合sw3_461003,自然景区sw3_461002,人工景区sw3_460902,餐饮sw3_460901,酒店sw3_460804,其他专业做事sw3_460803,会展做事sw3_460802,检测做事sw3_460801,人力资源做事sw3_460601,体育Ⅲsw3_450701,旅游零售Ⅲsw3_450603,电商做事sw3_450602,跨境电商sw3_450601,综合电商sw3_450304,商业物业经营sw3_430302,房产租赁经纪sw3_430301,物业管理sw3_430103,家本地产sw3_430102,商业地产sw3_421102,港口sw3_421101,航运sw3_421002,机场sw3_421001,航空运输sw3_420903,铁路运输sw3_420902,公交sw3_420901,高速公路sw3_420807,公路货运sw3_420806,仓储物流sw3_420805,跨境物流sw3_420804,快递sw3_420803,中间产品及消费品供应链做事sw3_420802,原材料供应链做事sw3_410110,电能综合做事sw3
_410109,其他能源发电sw3_410108,核力发电sw3_410107,风力发电sw3_410106,光伏发电sw3_370605,其他医疗做事sw3_370604,医院sw3_370603,医疗研发外包sw3_370602,诊断做事sw3_370504,体外诊断sw3_370503,医疗耗材sw3_370502,医疗设备sw3_370403,线下药店sw3_370402,医药流利sw3_370304,其他生物制品sw3_370303,疫苗sw3_370302,血液制品sw3_360502,娱乐用品sw3_360501,文化用品sw3_360311,其他家居用品sw3_360309,卫浴制品sw3_360308,定制家居sw3_360307,成品家居sw3_360306,瓷砖地板sw3_360206,综合包装sw3_360205,纸包装sw3_360204,塑料包装sw3_360203,金属包装sw3_360202,印刷sw3_360103,特种纸sw3_360102,大宗用纸sw3_350303,其他饰品sw3_350301,钟表珠宝sw3_350209,非运动服装sw3_350208,运动服装sw3_350107,纺织鞋类制造sw3_340901,调味发酵品Ⅲsw3_340803,熟食sw3_340802,烘焙食品sw3_340801,零食sw3_340702,乳品sw3_340701,软饮料sw3_340602,其他酒类sw3_340601,啤酒sw3_340501,白酒Ⅲsw3_340407,保健品sw3_340406,预加工食品sw3_330701,其他家电Ⅲsw3_330601,家电零部件Ⅲsw3_330501,照明设备Ⅲsw3_330402,卫浴电器sw3_330401,厨房电器sw3_330303,个护小家电sw3_330302,清洁小家电sw3_330301,厨房小家电sw3_330106,冰洗sw3_280602,商用载客车sw3_280601,商用载货车sw3_280502,综合乘用车sw3_280501,电动乘用车sw3_280402,摩托车sw3_280303,汽车综合做事sw3_280302,汽车经销商sw3_280206,汽车电子电气系统sw3_280205,其他汽车零部件sw3_280204,轮胎轮毂sw3_280203,底盘与发动机系统sw3_280202,车身附件及饰件sw3_270601,电子化学品Ⅲsw3_270504,消费电子零部件及组装sw3_270503,品牌消费电子sw3_270108,半导体设备sw3_270107,集成电路封测sw3_270106,集成电路制造sw3_270105,仿照芯片设计sw3_270104,数字芯片设计sw3_240603,锂sw3_240601,钴sw3_240505,钼sw3_240402,白银sw3_230501,特钢Ⅲsw3_230403,钢铁管材sw3_230402,板材sw3_230401,长材sw3_230302,冶钢辅料sw3_230301,铁矿石sw3_220901,非金属材料Ⅲsw3_220805,复合肥sw3_220804,钾肥sw3_220803,农药sw3_220802,磷肥及磷化工sw3_220801,氮肥sw3_220604,橡胶助剂sw3_220505,膜材料sw3_220504,合成树脂sw3_220406,锦纶sw3_220317,胶黏剂及胶带sw3_220316,有机硅sw3_220315,食品及饲料添加剂sw3_220206,钛白粉sw3_220205,煤化工sw3_110901,农业综合Ⅲsw3_110704,其他养殖sw3_110703,肉鸡养殖sw3_110702,生猪养殖sw3_110404,宠物食品sw3_110403,水产饲料sw3_110402,畜禽饲料sw3_110104,食用菌sw3_650401,航海装备Ⅲsw3_650301,地面兵装Ⅲsw3_650201,航空装备Ⅲsw3_650101,航天装备Ⅲsw3_640501,轨交设备Ⅲsw3_640209,其他专用设备sw3_640208,印刷包装机器sw3_640207,农用机器sw3_640206,纺织服装设备sw3_640204,楼宇设备sw3_640203,能源及重型设备sw3_640106,其他通用设备sw3_640105,制冷空调设备sw3_640103,磨具磨料sw3_640101,机床工具sw3_630306,其他电源设备Ⅲsw3_630304,火电设备sw3_630301,综合电力设备商sw3_630101,电机Ⅲsw3_620404,其他专业工程sw3_620403,国际工程sw3_620402,化学工程sw3_620
401,钢构造sw3_620201,装修装饰Ⅲsw3_620101,房屋培植Ⅲsw3_610303,其他建材sw3_610302,管材sw3_610301,耐火材料sw3_610201,玻璃制造sw3_610101,水泥制造sw3_510101,综合Ⅲsw3_490201,保险Ⅲsw3_490101,证券Ⅲsw3_450401,专业连锁Ⅲsw3_450303,多业态零售sw3_450302,超市sw3_450301,百货sw3_450201,贸易Ⅲsw3_430101,住宅开拓sw3_410301,燃气Ⅲsw3_410104,热力做事sw3_410102,水力发电sw3_410101,火力发电sw3_370201,中药Ⅲsw3_370102,化学制剂sw3_370101,质料药sw3_350206,家纺sw3_350205,鞋帽及其他sw3_350106,其他纺织sw3_350105,辅料sw3_350104,印染sw3_350102,棉纺sw3_340401,肉制品sw3_330202,其他玄色家电sw3_330201,彩电sw3_330102,空调sw3_280401,其他运输设备sw3_270401,其他电子Ⅲsw3_270303,光学元件sw3_270302,LEDsw3_270301,面板sw3_270203,被动元件sw3_270202,印制电路板sw3_270103,半导体材料sw3_270102,分立器件sw3_240504,其他小金属sw3_240502,钨sw3_240501,稀土sw3_240401,黄金sw3_240303,铅锌sw3_240302,铜sw3_240301,铝sw3_240202,磁性材料sw3_240201,其他金属新材料sw3_220603,炭黑sw3_220602,其他橡胶制品sw3_220503,改性塑料sw3_220501,其他塑料制品sw3_220405,氨纶sw3_220404,其他化学纤维sw3_220403,粘胶sw3_220401,涤纶sw3_220313,聚氨酯sw3_220311,氟化工sw3_220309,其他化学制品sw3_220308,纺织化学制品sw3_220307,民爆制品sw3_220305,涂料油墨sw3_220204,其他化学质料sw3_220203,无机盐sw3_220202,氯碱sw3_220201,纯碱sw3_110801,动物保健Ⅲsw3_110504,其他农产品加工sw3_110502,粮油加工sw3_110501,果蔬加工sw3_110301,林业Ⅲsw3_110202,水产养殖sw3_110201,海洋捕捞sw3_110103,其他栽种业sw3_110102,粮食栽种sw3_110101,种子
