提交 5f860747 · 作者: 黄泳齐

汽车之家爬虫

上级
# Default ignored files
/shelf/
/workspace.xml
# Editor-based HTTP Client requests
/httpRequests/
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>
\ No newline at end of file
<component name="InspectionProjectProfileManager">
<profile version="1.0">
<option name="myName" value="Project Default" />
<inspection_tool class="PyPep8NamingInspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
<option name="ignoredErrors">
<list>
<option value="N806" />
<option value="N802" />
</list>
</option>
</inspection_tool>
</profile>
</component>
\ No newline at end of file
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.11" project-jdk-type="Python SDK" />
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/carhome.iml" filepath="$PROJECT_DIR$/.idea/carhome.iml" />
</modules>
</component>
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="" vcs="Git" />
</component>
</project>
\ No newline at end of file
import gc
import os
import bs4
import requests as req
from selenium import webdriver
from selenium.webdriver.common.by import By
import json
import re
import pymysql
import time
import random
class Crack():
    """Wrapper that starts a headless Chrome session and keeps it in `browser`.

    The three constructor arguments are accepted but not used by the code
    in this class.
    """

    def __init__(self, keyword, username, passod):
        self.url = 'https://www.baidu.com'
        options = webdriver.ChromeOptions()
        # Command-line switches handed to Chrome, added in this order.
        for switch in (
            '--ignore-certificate-errors',
            '--headless',
            '--no-sandbox',
            '--disable-gpu',
            '--disable-dev-shm-usage',
        ):
            options.add_argument(switch)
        self.browser = webdriver.Chrome('/home/wolf/chromedriver', options=options)
def getSqlConnect():
    """Open and return a new pymysql connection to the local `carhome` database."""
    settings = {
        'host': 'localhost',
        'user': 'root',
        'password': '123456',
        'database': 'carhome',
        'charset': 'utf8',
        'port': 3306,
    }
    return pymysql.connect(**settings)
def getRandNum():
    """Return a numeric string: the current Unix time in whole seconds
    followed by a random nine-digit number."""
    seconds = int(time.time())
    suffix = random.randint(100000000, 999999999)
    return '%d%d' % (seconds, suffix)
# Plain-text columns of car_parameter_affiliation as
# (database column, key in carItem) pairs. A key that was not scraped is
# stored as ''. The order here is the column order used in the SQL.
_PARAMETER_TEXT_FIELDS = (
    ('level', '级别'),
    ('max_power', '最高车速(km/h)'),
    ('acceleration_0_100', '官方0-100km/h加速(s)'),
    ('wltc_oil_consumption', 'WLTC综合油耗(L/100km)'),
    ('car_warranty', '整车质保'),
    ('length', '长度(mm)'),
    ('width', '宽度(mm)'),
    ('height', '高度(mm)'),
    ('wheelbase', '轴距(mm)'),
    ('front_track_width', '前轮距(mm)'),
    ('back_track_width', '后轮距(mm)'),
    ('car_door_num', '车门数(个)'),
    ('car_seat_num', '座位数(个)'),
    ('tank_capacity', '油箱容积(L)'),
    ('back_up_capacity', '后备厢容积(L)'),
    ('curb_weight', '整备质量(kg)'),
    ('cylinder_arrange_form', '气缸排列形式'),
    ('cylinder_num', '气缸数(个)'),
    ('valve_train_num', '每缸气门数(个)'),
    ('valve_system', '配气机构'),
    ('max_horsepower', '最大马力(Ps)'),
    ('max_power_speed', '最大功率转速(rpm)'),
    ('max_torque_speed', '最大扭矩转速(rpm)'),
    ('max_net_power', '最大净功率(kW)'),
    ('roz', '燃油标号'),
    ('oil_supply_method', '供油方式'),
    ('cylinder_head_material', '缸盖材料'),
    ('cylinder_material', '缸体材料'),
    ('gearshift_num', '挡位个数'),
    ('front_suspension_type', '前悬架类型'),
    ('back_suspension_type', '后悬架类型'),
    ('assistance_type', '助力类型'),
    ('car_body_structure', '车体结构'),
    ('spare_tire_specifications', '备胎规格'),
)

# Equipment columns of car_parameter_affiliation as
# (database column, key in carItem, marker stored as level 3).
# NOTE(review): the tyre-pressure entry treats an empty string, not '●', as
# level 3. That is what the original code did and it is kept unchanged;
# confirm it is intended.
_PARAMETER_FLAG_FIELDS = (
    ('abs', 'ABS防抱死', '●'),
    ('brake_force_distribution', '制动力分配(EBD/CBC等)', '●'),
    ('brake_assist', '刹车辅助(EBA/BAS/BA等)', '●'),
    ('traction_control', '牵引力控制(ASR/TCS/TRC等)', '●'),
    ('stability_control', '车身稳定控制(ESC/ESP/DSC等)', '●'),
    ('engine_start_stop', '发动机启停技术', '●'),
    ('auto_park', '自动驻车', '●'),
    ('hill_start_assist', '上坡辅助', '●'),
    ('gps', '卫星导航系统', '●'),
    ('parallel_assistance', '并线辅助', '●'),
    ('auto_park_entry', '自动泊车入位', '●'),
    ('engine_anti_theft', '发动机电子防盗', '●'),
    ('back_cup_holder', '后排杯架', '●'),
    ('back_outlet', '后座出风口', '●'),
    ('tpms', '胎压监测功能', ''),
)

# Text columns of car_full_info copied straight from carItem.
_FULL_INFO_TEXT_FIELDS = (
    ('car_drive', '驱动方式'),
    ('car_tms', '变速箱'),
    ('car_engine', '发动机'),
    ('car_engine_model', '发动机型号'),
    ('market_time', '上市时间'),
    ('car_energy_type', '能源类型'),
    ('front_brake_type', '前制动器类型'),
    ('back_brake_type', '后制动器类型'),
    ('parking_brake_type', '驻车制动类型'),
    ('front_tire_size', '前轮胎规格'),
    ('back_tire_size', '后轮胎规格'),
)

# Numeric code stored in car_full_info.engine_type for each value of
# '进气形式'; anything else (including '-') is stored as 0.
_ENGINE_TYPE_CODES = {
    '自然吸气': 1,
    '涡轮增压': 2,
    '涡轮增压+电动增压': 3,
    '四涡轮增压': 4,
    '双涡轮增压': 5,
    '机械增压': 6,
    '机械+涡轮增压': 7,
}

_EXCEPTION_LOG_PATH = "D:\\autoHome\\异常数据\\exception.txt"


def _field(carItem, key, n, default=''):
    """Return carItem[key][n], or `default` when `key` was not scraped."""
    if key in carItem:
        return carItem[key][n]
    return default


def _flag_level(carItem, key, n, top_marker='●'):
    """Map an equipment marker to 3 (== top_marker), 2 ('○') or 1 (otherwise)."""
    if key not in carItem:
        return 1
    marker = carItem[key][n]
    if marker == top_marker:
        return 3
    if marker == '○':
        return 2
    return 1


def _insert_sql(table, columns):
    """Build a parameterised INSERT for the given (code-constant) columns."""
    placeholders = ','.join(['%s'] * len(columns))
    return 'insert into %s (%s) values (%s)' % (table, ','.join(columns), placeholders)


def _update_sql(table, columns):
    """Build a parameterised UPDATE ... WHERE id = %s for the given columns."""
    assignments = ','.join(column + '=%s' for column in columns)
    return 'update ' + table + ' set ' + assignments + ' where id = %s'


def _lookup_or_create(cursor, table, name, extra=()):
    """Return `<table>_id` of the row whose `<table>_name` equals `name`.

    When no such row exists, one is inserted with an id from getRandNum()
    plus the `extra` (column, value) pairs, and the new id is returned.
    `name` is passed as a query parameter so quotes in scraped text cannot
    break or alter the statement.
    """
    id_column = table + '_id'
    name_column = table + '_name'
    cursor.execute(
        'select ' + id_column + ' from ' + table + ' where ' + name_column + ' = %s',
        (name,),
    )
    row = cursor.fetchone()
    if row is not None:
        return row[0]
    new_id = getRandNum()
    columns = [id_column, name_column] + [column for column, _ in extra]
    values = [new_id, name] + [value for _, value in extra]
    cursor.execute(_insert_sql(table, columns), values)
    return new_id


def _optional_lookup(cursor, carItem, key, n, table):
    """Return (name, id) for an optional dimension; ('', 0) when not scraped."""
    if key not in carItem:
        return '', 0
    name = carItem[key][n]
    return name, _lookup_or_create(cursor, table, name)


def _store_car(cursor, carItem, n, year):
    """Run every lookup/insert/update for spec `n`; return log (label, value) pairs."""
    car_master_brand_name = carItem['一级分类'][n]
    car_brand_name = carItem['二级分类'][n]
    car_series_name = carItem['三级分类'][n]
    first_letter = carItem['字母'][n]
    car_new_name = carItem['车型名称'][n]
    auto_home_id = carItem['ID'][n]
    car_year = year

    # Dimension tables, resolved in the same order as the original code.
    car_emission_standard_name, car_emission_standard_id = _optional_lookup(
        cursor, carItem, '环保标准', n, 'car_emission_standard')
    car_structure, car_structure_id = _optional_lookup(
        cursor, carItem, '车身结构', n, 'car_structure')
    car_dpm, car_dpm_id = _optional_lookup(
        cursor, carItem, '排量(L)', n, 'car_dpm')

    intake = _field(carItem, '进气形式', n, None)
    engine_type = _ENGINE_TYPE_CODES.get(intake, 0) if isinstance(intake, str) else 0

    car_master_brand_id = _lookup_or_create(
        cursor, 'car_master_brand', car_master_brand_name)
    car_brand_id = _lookup_or_create(
        cursor, 'car_brand', car_brand_name,
        extra=(('car_master_brand_id', car_master_brand_id),
               ('car_master_brand_name', car_master_brand_name)))
    car_series_id = _lookup_or_create(
        cursor, 'car_series', car_series_name,
        extra=(('car_master_brand_id', car_master_brand_id),
               ('car_master_brand_name', car_master_brand_name),
               ('car_brand_id', car_brand_id),
               ('car_brand_name', car_brand_name)))
    car_year_id = _lookup_or_create(cursor, 'car_year', year)

    # --- car_full_info: insert when the spec is new, otherwise update. ---
    info = [
        ('first_letter', first_letter),
        ('car_master_brand_id', car_master_brand_id),
        ('car_master_brand_name', car_master_brand_name),
        ('car_brand_id', car_brand_id),
        ('car_brand_name', car_brand_name),
        ('car_series_id', car_series_id),
        ('car_series_name', car_series_name),
        ('car_new_name', car_new_name),
        ('car_emission_standard_id', car_emission_standard_id),
        ('car_emission_standard_name', car_emission_standard_name),
        ('car_year_id', car_year_id),
        ('car_year', car_year),
        ('car_dpm_id', car_dpm_id),
        ('car_dpm', car_dpm),
        ('car_structure_id', car_structure_id),
        ('car_structure', car_structure),
        ('engine_type', engine_type),
    ]
    info.extend(
        (column, _field(carItem, key, n)) for column, key in _FULL_INFO_TEXT_FIELDS)

    # The original compared against a quoted literal ('%d'), so the id is
    # sent as a string parameter to keep the comparison unchanged.
    cursor.execute(
        'select id from car_full_info where auto_home_id = %s', (str(auto_home_id),))
    row = cursor.fetchone()
    if row is None:
        new_row = info + [('auto_home_id', auto_home_id), ('new_car', 1)]
        cursor.execute(
            _insert_sql('car_full_info', [column for column, _ in new_row]),
            [value for _, value in new_row])
        car_full_info_id = cursor.lastrowid
    else:
        car_full_info_id = row[0]
        cursor.execute(
            _update_sql('car_full_info', [column for column, _ in info]),
            [value for _, value in info] + [car_full_info_id])

    # --- car_parameter_affiliation: same insert-or-update pattern. ---
    params = [
        (column, _field(carItem, key, n)) for column, key in _PARAMETER_TEXT_FIELDS]
    params.extend(
        (column, _flag_level(carItem, key, n, marker))
        for column, key, marker in _PARAMETER_FLAG_FIELDS)
    now = time.strftime('%Y-%m-%d %H:%M:%S')

    cursor.execute(
        'select id from car_parameter_affiliation where auto_home_id = %s',
        (str(auto_home_id),))
    row = cursor.fetchone()
    if row is None:
        new_row = ([('car_full_info_id', car_full_info_id),
                    ('auto_home_id', auto_home_id)]
                   + params
                   + [('create_time', now), ('update_time', now)])
        cursor.execute(
            _insert_sql('car_parameter_affiliation', [column for column, _ in new_row]),
            [value for _, value in new_row])
    else:
        changed = params + [('update_time', now)]
        cursor.execute(
            _update_sql('car_parameter_affiliation', [column for column, _ in changed]),
            [value for _, value in changed] + [row[0]])

    # Labels are the ones the original exception log used.
    return [
        ('car_year:', car_year),
        ('car_structure: ', car_structure),
        ('car_dpm: ', car_dpm),
        ('car_emission_standard: ', car_emission_standard_name),
        ('car_master_brand: ', car_master_brand_name),
        ('car_brand: ', car_brand_name),
        ('car_series: ', car_series_name),
        ('car_full_info: ', auto_home_id),
        ('car_parameter_affiliation: ', auto_home_id),
        ('字母: ', first_letter),
    ]


def _log_failed_commit(summary):
    """Append one line describing a spec whose commit failed to the log file."""
    content = ''.join(label + str(value) for label, value in summary)
    with open(_EXCEPTION_LOG_PATH, "a", encoding="utf-8") as f:
        f.write(content + "\n")


def handleData(carItem, n, year):
    """Store spec number `n` of a scraped configuration table in MySQL.

    Args:
        carItem: dict mapping a scraped parameter name to a list holding one
            value per spec; the keys '一级分类', '二级分类', '三级分类',
            '字母', '车型名称' and 'ID' are required, all others optional.
        n: index of the spec inside each of those lists.
        year: model-year label, stored as `car_year` and looked up in the
            `car_year` table.

    The function opens its own connection via getSqlConnect(), resolves or
    creates the dimension rows (emission standard, structure, displacement,
    master brand, brand, series, year), then inserts or updates the
    `car_full_info` and `car_parameter_affiliation` rows keyed by
    `auto_home_id`. If the commit fails the transaction is rolled back and a
    line is appended to the exception log. Errors raised by the statements
    themselves propagate to the caller; the cursor and connection are closed
    in every case.
    """
    conn = getSqlConnect()
    try:
        cursor = conn.cursor()
        try:
            summary = _store_car(cursor, carItem, n, year)
            try:
                conn.commit()
            except Exception:
                conn.rollback()
                _log_failed_commit(summary)
        finally:
            cursor.close()
    finally:
        conn.close()
firstSite = "https://www.autohome.com.cn/grade/carhtml/"
secondSite = "https://car.autohome.com.cn/config/series/"
threeSite = "https://www.autohome.com.cn/"

# Directory where the temporary HTML page rendered by Chrome is written.
_JS_HTML_DIR = "/home/wolf/jshtml/"

# Seconds to wait for a response before giving up; without a timeout a
# stalled server would block the crawl forever.
_REQUEST_TIMEOUT = 30

# Minimal stand-ins for `document` / `window` so the page's own scripts can
# run in a blank page; every CSS rule they insert is collected in `rules`.
_DOM_STUB_JS = (
    "var rules = '2';"
    "var document = {};"
    "function getRules(){return rules}"
    "document.createElement = function() {"
    " return {"
    " sheet: {"
    " insertRule: function(rule, i) {"
    " if (rules.length == 0) {"
    " rules = rule;"
    " } else {"
    " rules = rules + '#' + rule;"
    " }"
    " }"
    " }"
    " }"
    "};"
    "document.querySelectorAll = function() {"
    " return {};"
    "};"
    "document.head = {};"
    "document.head.appendChild = function() {};"
    "var window = {};"
    "window.decodeURIComponent = decodeURIComponent;"
)


def _fetch(url, **kwargs):
    """GET `url`, retrying exactly once if the first attempt raises."""
    kwargs.setdefault('timeout', _REQUEST_TIMEOUT)
    try:
        return req.get(url, **kwargs)
    except Exception:
        return req.get(url, **kwargs)


def _soup(resp):
    """Parse a response body as HTML, decoding as GBK and falling back to UTF-8."""
    try:
        html = str(resp.content, 'gbk')
    except UnicodeDecodeError:
        html = str(resp.content, 'utf8')
    return bs4.BeautifulSoup(html, "html.parser")


def _collect_model_years(carId):
    """Return {year id: year label} gathered from the series page and sale page."""
    yearRel = {}

    page = _soup(_fetch(threeSite + carId))
    links = page.select('.athm-sub-nav__car__year dd a')
    # The original dropped the last link of this menu; presumably it is not
    # a model-year entry. TODO confirm against the live page.
    for link in links[:-1]:
        yearid = re.findall(r"\d+/(.+?)/", link.get('href') or '')
        year = re.findall(r"\d+", link.text)
        if yearid and year:
            yearRel[yearid[0]] = year[0]

    print(threeSite + carId + '/sale.html')
    page = _soup(_fetch(threeSite + carId + '/sale.html'))
    for link in page.select('a[data-yearid]'):
        year = re.findall(r"\d+", link.text)
        if year:
            yearRel[link.get('data-yearid')] = year[0]
    return yearRel


def _render_css_rules(fileName, text):
    """Run the page's obfuscation scripts in headless Chrome and return the
    text of the resulting page body (the CSS rules they inserted).

    The temporary HTML file is removed and the browser is shut down even
    when rendering fails.
    """
    scripts = re.findall(r'(\(function\([a-zA-Z]{2}.*?_\).*?\(document\);)', text)
    page = (
        "<html><meta http-equiv='Content-Type' content='text/html; charset=utf-8' /><head></head><body> <script type='text/javascript'>"
        + _DOM_STUB_JS
        + ''.join(scripts)
        + " document.write(rules)</script></body></html>"
    )
    path = _JS_HTML_DIR + fileName + ".html"
    with open(path, "w", encoding="utf-8") as f:
        f.write(page)
    try:
        crack = Crack('牛逼', '13681489357', '136814893527')
        try:
            url = "file://" + path
            try:
                crack.browser.get(url)
            except Exception:
                crack.browser.get(url)
            return crack.browser.find_element(By.TAG_NAME, 'body').text
        finally:
            # quit() also ends the chromedriver process; close() only closes
            # the window, which left one driver process behind per model year.
            crack.browser.quit()
    finally:
        os.remove(path)


def _substitute_spans(jsonData, rules_text):
    """Replace each `<span class='X'></span>` placeholder in `jsonData` with
    the quoted `content` value of the `X::before` rule found in `rules_text`."""
    for attrs in re.findall("<span(.*?)></span>", jsonData):
        name_match = re.search("'(.*?)'", attrs)
        if name_match is None:
            continue
        class_name = name_match.group(1)
        rule = re.search(re.escape(class_name) + "::before { content:(.*?)}", rules_text)
        if rule is None:
            continue
        quoted = re.search("\"(.*?)\"", rule.group(1))
        if quoted is None:
            continue
        jsonData = jsonData.replace(
            "<span class='" + class_name + "'></span>", quoted.group(1))
    return jsonData


def _build_car_item(config, option, car_info):
    """Flatten the parsed `config` / `option` objects into the
    {parameter name: [value per spec]} mapping consumed by handleData()."""
    carItem = {}
    for paramtypeitem in config['result']['paramtypeitems']:
        for car in paramtypeitem['paramitems']:
            values = car['valueitems']
            carItem[car['name']] = [ca['value'] for ca in values]
            # As in the original, the bookkeeping columns are rebuilt from
            # every parameter; the last parameter processed wins.
            carItem['ID'] = [ca['specid'] for ca in values]
            carItem['一级分类'] = [car_info['classA']] * len(values)
            carItem['二级分类'] = [car_info['classB']] * len(values)
            carItem['三级分类'] = [car_info['classC']] * len(values)
            carItem['字母'] = [car_info['word']] * len(values)
    if option is not None:
        for configtypeitem in option['result']['configtypeitems']:
            for car in configtypeitem['configitems']:
                carItem[car['name']] = [ca['value'] for ca in car['valueitems']]
    return carItem


def _process_model_year(carId, yearid, year, car_info):
    """Download, de-obfuscate and store the configuration table of one
    series (`carId`) and model year (`yearid`)."""
    fileName = str(carId) + '-' + str(yearid)
    secSite = secondSite + carId + '-' + yearid + '.html'
    resp = _fetch(secSite, stream=True)
    if secSite != resp.url:
        # The request ended on a different URL; the original skipped these.
        return
    text = str(resp.content, encoding="utf-8")
    config_match = re.search('var config = (.*?){1,};', text)
    if config_match is None:
        print("没有参数配置的车: " + secSite)
        return

    jsonData = config_match.group(0)
    option_match = re.search('var option = (.*?)};', text)
    if option_match is not None:
        jsonData = jsonData + option_match.group(0)
    # The trailing "var carInfo" statement is kept because the option
    # pattern below needs a ";var" after the option object. json.dumps
    # escapes quotes in the names, which plain concatenation did not.
    jsonData = jsonData + 'var carInfo = ' + json.dumps(car_info, ensure_ascii=False) + ';'

    rules_text = _render_css_rules(fileName, text)
    jsonData = _substitute_spans(jsonData, rules_text).replace("&nbsp;", "")

    configRe = re.findall("var config = (.*?);", jsonData)
    optionRe = re.findall("var option = (.*?);var", jsonData)
    if not configRe:
        print("没有参数配置的车: " + secSite)
        return
    config = json.loads(configRe[0])
    option = json.loads(optionRe[0]) if optionRe else None

    carItem = _build_car_item(config, option, car_info)
    names = carItem.get('车型名称')
    if not names:
        return
    for n in range(len(carItem.get('ID', []))):
        if names[n] == '-':
            continue
        handleData(carItem, n, year)
    gc.collect()


if __name__ == "__main__":
    # Only the letter "B" is crawled, exactly as in the original range.
    wordArr = [chr(i) for i in range(ord("B"), ord("B") + 1)]
    for word in wordArr:
        index_page = _soup(_fetch(firstSite + word + '.html'))
        for dl in index_page.find_all('dl'):
            classA = dl.dt.div.a.text
            for li in dl.dd.find_all('li'):
                if li.h4 is None:
                    continue
                classB = li.parent.previous_sibling.previous_sibling.next.next
                carId = li['id'][1:]
                car_info = {
                    "classA": classA,
                    "classB": str(classB),
                    "classC": li.h4.a.text,
                    "word": word,
                }
                for yearid, year in _collect_model_years(carId).items():
                    _process_model_year(carId, yearid, year, car_info)
    print("结束")
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论