Tommonkey

All greatness comes from a brave beginning

0%

爬取谷歌搜索结果第一条链接

SRC信息收集时,常常也会通过浏览器去搜索相关目标域名等信息。一两个目标还可以手动查询,但成千上万个就不现实了,所以写了这个小demo。把想要查询的数千个关键字(比如公司名称等)放入当前目录的target.txt中,然后运行脚本即可解放双手,坐享其成。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
# -*- coding: UTF-8 -*-

import requests
import time
import random
from lxml import etree
import urllib3
import threading


# ASCII-art startup banner printed by the __main__ block (tool version +
# author tag). NOTE(review): the art lives in a non-raw string; its
# backslash pairs like "\\ (" are escape sequences, so the printed art may
# differ slightly from what is written here — a raw string would be safer,
# but changing it would alter runtime output, so it is left as-is.
banner = """
V1.0.1
___ _ __ _
/ _ \___ ___ __ _| | ___ / _\ ___ __ _ _ __ ___| |__
/ /_\/ _ \ / _ \ / _` | |/ _ \ \ \ / _ \/ _` | '__/ __| '_ \
/ /_\\ (_) | (_) | (_| | | __/ _\ \ __/ (_| | | | (__| | | |
\____/\___/ \___/ \__, |_|\___| \__/\___|\__,_|_| \___|_| |_|
|___/
Tommonkey
"""


# setting proxy
# HTTP/HTTPS proxy for every outgoing request — Google is typically not
# reachable directly from the author's network; 7890 is the default local
# port of common proxy clients (e.g. Clash). Adjust to your own setup.
proxies = {
    'http': 'http://localhost:7890',
    'https': 'http://localhost:7890'
}

# Desktop Chrome User-Agent so Google serves the regular HTML results page
# instead of a bot/consent interstitial.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.54 Safari/537.36",
}


def input_data():
    """Read the search keywords from ./target.txt, one keyword per line.

    Returns:
        list[str]: every line of target.txt with its trailing newline
        removed, in file order (empty lines are kept, matching the
        original behavior).

    Raises:
        FileNotFoundError: if target.txt does not exist in the cwd.
    """
    keywords = []
    # BUG FIX: the original passed encoding="utf=8" (typo for "utf-8"),
    # which raises LookupError("unknown encoding: utf=8") on first use.
    with open("./target.txt", encoding="utf-8") as f:
        for line in f:
            # Only strip the newline, not other whitespace — keywords may
            # legitimately contain leading/trailing spaces.
            keywords.append(line.strip("\n"))
    return keywords


# send requests
def requestPackage(i):
    """Google-search keyword *i* and record the first result's breadcrumb.

    Fetches https://www.google.com/search?q=<i> through the configured
    proxy, extracts the breadcrumb/URL text of the first organic result via
    XPath, and appends it to result.txt (when it contains "edu") or
    fail.txt otherwise.

    Args:
        i (str): the keyword to search for (company name, domain, ...).

    Returns:
        int: 0 when a result was extracted and written, 1 on any failure
        (non-200 response, empty XPath match, or a network error).
    """
    # stdlib; imported locally so this fix is self-contained.
    from urllib.parse import quote_plus

    try:
        urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
        # BUG FIX: the keyword must be percent-encoded. Raw spaces, CJK
        # characters, '&' or '#' in the bare concatenation truncate or
        # corrupt the query string Google receives.
        googleUrl = "https://www.google.com/search?q=" + quote_plus(i)
        data_Raw = requests.get(url=googleUrl, headers=headers, proxies=proxies, timeout=20, verify=False)
        data_Raw.close()
        # Randomized delay between queries to reduce the chance of a
        # captcha / temporary ban from Google.
        time.sleep(random.randint(2, 5))
        print("[+] 正在搜索{}".format(i))
        if data_Raw.status_code == 200:
            print("[+] 回包状态值:{}".format(data_Raw.status_code))
            data_text = data_Raw.text
            # core deal string
            # The class string below is Google's breadcrumb element of the
            # first organic hit; Google changes these generated class names
            # periodically — TODO confirm it is still current.
            result = etree.HTML(data_text).xpath('//*[@class="tjvcx GvPZzd cHaqb"]/text()')
            if len(result) != 0:
                if "edu" in result[0]:
                    print("[+]" + result[0])
                    # encoding pinned to utf-8 so CJK keywords don't crash
                    # on consoles/locales with a non-UTF-8 default codec.
                    with open("result.txt", mode="a+", encoding="utf-8") as fd:
                        fd.write(i + ":" + result[0] + "\n")
                else:
                    print("[+]" + result[0])
                    with open("fail.txt", mode="a+", encoding="utf-8") as fd:
                        fd.write(result[0] + ":" + i + "\n")
                return 0
        # Non-200 response or empty XPath match both count as a failure.
        return 1
    except OSError:
        # requests' network errors derive from IOError/OSError; treat them
        # as a retryable failure (caller retries) instead of crashing.
        pass
    return 1


def queryData(list):
    """Run requestPackage for every keyword, retrying each failure.

    A keyword that fails its first request is retried up to three more
    times; if every retry fails it is appended to NoCapture.txt. Any
    unexpected exception aborts the whole batch and is printed.

    Args:
        list: iterable of keyword strings (parameter name kept for the
            existing caller, although it shadows the builtin).
    """
    try:
        for keyword in list:
            if requestPackage(keyword) == 0:
                continue  # first attempt succeeded — next keyword
            # First attempt failed: up to three retries.
            for attempt in (1, 2, 3):
                print("[+] 发送请求失败,正在重试,重试次数:{}/3".format(attempt))
                if requestPackage(keyword) != 1:
                    break  # retry succeeded
                if attempt == 3:
                    # All retries exhausted — remember the keyword so it
                    # can be re-run later by hand.
                    print("[+] 抓取失败,写入NoCapture.txt")
                    with open("NoCapture.txt", mode="a+") as fd:
                        fd.write(keyword + "\n")
    except Exception as err:
        print(err)


if __name__ == "__main__":
    # Script entry point: show the banner, load keywords, run the batch.
    print(banner)
    list = input_data()  # NOTE(review): shadows the builtin `list`
    result = queryData(list)
    # queryData has no return statement, so this always prints "None".
    print(result)

将需要查询的关键字批量放入当前目录下的target.txt。直接运行该脚本即可。

  • github地址
    1
    https://github.com/tonmonkey/googleFirstWeb
奖励作者买杯可乐?