Python 爬取网页源代码，提取关键词信息

伊渃土鸡蛋 · 发表于 2024-9-8 17:36:54

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import time
import csv
from tqdm import tqdm

# Script: read a list of URLs from a CSV file, open each one in headless
# Chrome, wait for the page's <title> element and read the page title.
#
# NOTE(review): the listing this was recovered from had all whitespace
# stripped and is cut off right after `title = driver.title`. Everything
# the original did after that point (what it extracts and what it appends
# to the output file) is not visible and has NOT been reconstructed here.

# Configure Chrome to run in headless mode.
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
chrome_options.add_argument('--no-sandbox')

# Disable image loading.
prefs = {"profile.managed_default_content_settings.images": 2}
chrome_options.add_experimental_option("prefs", prefs)

# Name of the CSV file holding the URLs.
csv_file = 'lists_3.csv'

# Path to the ChromeDriver executable.
driver_path = '/usr/bin/chromedriver'

# Build the Service object and the WebDriver instance.
service = Service(driver_path)
driver = webdriver.Chrome(service=service, options=chrome_options)

try:
    # Read the URL list from the CSV file. Empty rows are skipped and the
    # URL is assumed to be the first field of each row. newline='' is what
    # the csv module documentation asks for when opening files for a reader.
    with open(csv_file, mode='r', encoding='utf-8', newline='') as file:
        urls = [row[0] for row in csv.reader(file) if row]

    # Output file.
    output_file = 'code.txt'

    # Open the output file in append mode so earlier results are kept.
    with open(output_file, 'a', encoding='utf-8') as file:
        # NOTE(review): the progress-bar label read "ProcessingURLs" in the
        # mangled listing; the space is assumed to have been stripped along
        # with all other whitespace.
        for url in tqdm(urls, desc="Processing URLs"):
            try:
                # Visit the URL.
                driver.get(url)

                # Explicitly wait (up to 10 s) for the <title> element.
                WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.TAG_NAME, "title"))
                )

                # Read the page title.
                title = driver.title

                # NOTE(review): the original listing is truncated here
                # ("try to get ..."); the remaining extraction and the
                # write to `file` must be restored from the full source.
            except Exception as exc:
                # NOTE(review): the original handler is not visible. This
                # per-URL boundary reports the failure and moves on so one
                # bad URL does not abort the whole batch.
                tqdm.write(f"Failed to process {url}: {exc}")
finally:
    # Always shut the browser down, even when reading the CSV or
    # processing a page fails; otherwise the Chrome process is leaked.
    driver.quit()

		自动登录	找回密码
密码			会员注册