В общем, сохраняю JSON-ответ в csv, получается как на картинке.
В итоге хочу достигнуть такой результат. Подскажите как реализовать(желательно с примером), или литературу какую-нибудь.
Вот сам код.
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
from random import choice
from time import sleep
from random import uniform
import json
import csv
def get_html(url, useragent, proxy, StartFroms):
    """Fetch one page of coupon data and dump the raw JSON response.

    Posts a JSON payload to the toto-info data service and writes the
    decoded response into ``response.csv``.  NOTE: the file content is
    JSON, not CSV, and each call overwrites the previous dump -- this is
    the behaviour the question is about.

    :param url: data-service endpoint to POST to
    :param useragent: User-Agent header value for this request
    :param proxy: requests-style proxies dict, e.g. ``{'http': 'http://...'}``
    :param StartFroms: paging offset sent as ``StartFrom`` in the payload
    """
    s = requests.Session()
    # Warm-up GET so the session collects cookies before the POST.
    s.get("http://toto-info.co", proxies=proxy)
    pl = {"options": {"DrawingId": 632, "StartFrom": StartFroms, "Count": 20,
                      "SortField": "CouponCode", "SortDir": "ASC"}}
    res = s.post(url,
                 headers={'User-Agent': useragent,
                          'Accept-Language': 'ru,en;q=0.8',
                          'Accept-Encoding': 'gzip, deflate, sdch',
                          # was 'keep - alive': the stray spaces make the
                          # header value invalid
                          'Connection': 'keep-alive',
                          'Content-Type': 'application/json',
                          'Host': 'old.toto-info.co',
                          'Origin': 'http://toto-info.co',
                          'Referer': 'http://toto-info.co/'},
                 proxies=proxy, data=json.dumps(pl))
    data = res.json()
    with open('response.csv', 'w', encoding='utf-8') as file:
        json.dump(data, file, indent=2, ensure_ascii=False)
    print('------------------------------------------')
def main():
    """Page through the coupon service, restarting from a failed offset."""
    url = 'http://old.toto-info.co/DataService.svc/GetMaxPrizeCoupons'
    # splitlines() restores the one-entry-per-line parsing that the broken
    # split('\n') intended; the with-blocks close the file handles.
    with open("useragents.txt") as f:
        useragents = f.read().splitlines()
    with open("proxies.txt") as f:
        proxies = f.read().splitlines()
    start = 0      # offset to (re)start the for-loop from
    finish = 1000
    while start < finish:
        i = start  # keep i bound even if the loop body raises immediately
        try:
            for i in range(start, finish, 20):
                sleep(uniform(1, 2))  # random delay to look less bot-like
                useragent = "'" + choice(useragents) + "'"
                proxy = {'http': 'http://' + choice(proxies)}
                get_html(url, useragent, proxy, i)
        except Exception:
            # Restart the for-loop from the offset that failed (the original
            # bare except also hid KeyboardInterrupt/SystemExit).
            start = i
        else:
            # All pages fetched: without this the while-loop never ends,
            # because start was only ever updated inside except.
            start = finish
# Run the scraper only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Ответ
Вот рабочий скрипт. Для теста я указал finish = 100 (чтобы быстрее отработало):
import requests
from random import choice
from time import sleep
from random import uniform
import csv
import json
import pandas as pd
def get_html(url, useragent, proxy, StartFroms):
    """Fetch one 20-item page of coupons and return the parsed items.

    :param url: data-service endpoint to POST to
    :param useragent: User-Agent header value for this request
    :param proxy: requests-style proxies dict used for the POST
    :param StartFroms: paging offset sent as ``StartFrom`` in the payload
    :return: list of coupon dicts taken from ``resp.json()['d']['Items']``
    """
    s = requests.Session()
    # Warm-up GET so the session collects cookies before the POST.
    s.get("http://toto-info.co")
    pl = {"options": {"DrawingId": 632, "StartFrom": StartFroms, "Count": 20,
                      "SortField": "CouponCode", "SortDir": "ASC"}}
    resp = s.post(url,
                  headers={'User-Agent': useragent,
                           'Accept-Language': 'ru,en;q=0.8',
                           'Accept-Encoding': 'gzip, deflate, sdch',
                           # was 'keep - alive': stray spaces make the
                           # header value invalid
                           'Connection': 'keep-alive',
                           'Content-Type': 'application/json',
                           'Host': 'old.toto-info.co',
                           'Origin': 'http://toto-info.co',
                           'Referer': 'http://toto-info.co/'},
                  proxies=proxy, data=json.dumps(pl))
    return resp.json()['d']['Items']
def main():
    """Collect all coupon pages and export the extracted options to Excel."""
    url = 'http://old.toto-info.co/DataService.svc/GetMaxPrizeCoupons'
    # splitlines() restores the one-entry-per-line parsing that the broken
    # split('\n') intended; the with-blocks close the file handles.
    with open(r"D:\download\useragents.txt") as f:
        useragents = f.read().splitlines()
    with open(r"D:\download\proxies.txt") as f:
        proxies = f.read().splitlines()
    start = 0    # current paging offset; only advanced on success
    finish = 100
    data = []
    while start < finish:
        try:
            sleep(uniform(1, 2))  # random delay between requests
            useragent = "'{}'".format(choice(useragents))
            proxy = {'http': 'http://{}'.format(choice(proxies))}
            print('processing:\t[{}] ...'.format(start))
            data += get_html(url, useragent, proxy, start)
            start += 20  # move to the next page only after a clean fetch
        except Exception as e:
            # Log and retry the same offset with a new proxy/user-agent.
            print('Exception:\t{}'.format(str(e)))
    print('building DataFrame ...')
    df = pd.DataFrame(data)
    out_fn = r'd:/temp/result.xlsx'
    # For each coupon pull every "N-(...)" option into its own column and
    # write one row per coupon.  rename_axis(None, 1) relied on the
    # deprecated positional axis argument (removed in modern pandas);
    # axis must be passed by keyword.
    df.set_index('CouponCode')['Options'] \
      .str.extractall(r'\d+-\((.*?)\)')[0] \
      .unstack().reset_index().rename_axis(None, axis=1) \
      .to_excel(out_fn, index=False)
# Run the scraper only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Комментариев нет:
Отправить комментарий