一键自动化获取网站信息

你也别回来了,我有颈椎病,回头有害健康.

原理

通过站长之家获取相关的信息资讯

然后使用masscan扫描开放端口

最后使用端口库获取开放端口的相关信息

实现起来并不复杂,但是昨晚上了一个夜班后感觉脑子真的不好使了,修改BUG修改到八点多才弄完,好累

获取信息

# -*- coding: utf-8 -*-
import multiprocessing
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
import re
import requests
import random
import time
import masscan
import os
import struct
import socket,string,struct
from tinydb import TinyDB, where
from tinydb.storages import JSONStorage
from tinydb.middlewares import CachingMiddleware
from collections import namedtuple

# Named tuple with the four fields of a port record. It is not referenced
# by the rest of the code visible in this file.
Port = namedtuple("Port", ["name", "port", "protocol", "description"])

# Directory holding this script; the port database is expected next to it.
__BASE_PATH__ = os.path.dirname(os.path.abspath(__file__))
# Full path of the JSON file backing the port database.
__DATABASE_PATH__ = os.path.join(__BASE_PATH__, 'ports.json')
# TinyDB handle over ports.json (JSON storage behind CachingMiddleware),
# created at import time and queried by get_ports().
__DB__ = TinyDB(__DATABASE_PATH__, storage=CachingMiddleware(JSONStorage))


def get_ports(port, like=False):
    """
    Look up one port record in the TinyDB port database.

    The field to match is chosen from the argument itself: an all-digit
    string is compared against the ``port`` field, anything else against
    the ``name`` field.

    :param port: port number or service name, as a string
    :param like: when true, use ``port`` as a search pattern
        (``Query.search``) instead of requiring an exact match
    :return: the first matching record, or an empty list when nothing
        matches (callers compare the result against ``[]``)
    """
    where_field = "port" if port.isdigit() else "name"
    field = where(where_field)
    if like:
        condition = field.search(port)
    else:
        condition = field == port
    matches = __DB__.search(condition)
    # Explicit emptiness check instead of indexing inside a bare except:
    # the "no hit" sentinel stays [] so existing callers keep working.
    return matches[0] if matches else []

# Pool of desktop browser User-Agent strings; scan_seo() picks one at random
# for each request it sends.
headerss = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"]

# Regular expressions applied by scan_seo() to the seo.chinaz.com result page.
# url,title,weights,ip,ages,whois_id,whois_type,whois_name,whois_time
# URL, title, Baidu weight, IP info, site age, ICP filing number, filing type, filing name, filing date
# include_baidu,request,text,service,language
# Baidu index count, (blank), protocol type, page type, server type, programming language
title_parrten = 'class="w61-0"><div class="ball">(.*?)</div></td>'  # group(1); works
ip_parrten = '>IP:(.*?)</a></div>'  # group(1); works
# Original author's note: the patterns below raise errors
# (presumably when the page has no match for them -- TODO confirm).
ages = '" target="_blank">(.*?)</a></div></div>'  # group(1)
whois_id = '备案号:</span><a href=.*?" target="_blank">(.*?)</a></div>'  # needs group(1)
whois_type = '<span>性质:</span><strong>(.*?)</strong></div>'  # needs group(1)
whois_name = '<span>名称:</span><strong>(.*?)</strong></div>'  # needs group(1)
whois_time = '<span>审核时间:</span><strong>(.*?)</strong></div>'  # needs group(1)
include_baidu = '<div class="Ma01LiRow w12-1 ">(.*?)</div>'  # group(1)
infos = '<div class="MaLi03Row w180">(.*?)</div>'  # use findall; items 0, 1, 2, 3


def get_baidu_weights(url):
    """
    Query rank.chinaz.com for the Baidu weight of a site.

    Posts a JSONP-style request to ``ajaxseo.aspx`` and extracts the single
    digit that follows ``"br":`` in the response body.

    :param url: site address, sent as the ``host`` form field
    :return: the weight digit as a string, or ``'无权重'`` when the request
        fails or the response does not contain a weight
    """
    x = str(random.randint(1, 9))
    # The callback name must be identical in the query string and the form.
    callback = 'jQuery111303146901980779846_154444474116%s' % (x)
    data = {
        't': 'rankall',
        'on': 1,
        'type': 'baidupc',
        'callback': callback,
        'host': url
    }

    headers = {
        'Accept': 'text/javascript, application/javascript, application/ecmascript, application/x-ecmascript, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Connection': 'keep-alive',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Cookie': 'UM_distinctid=165af67ee6f352-07238a34ed3941-9393265-1fa400-165af67ee70473; CNZZDATA5082706=cnzz_eid%3D832961605-1544438317-null%26ntime%3D1544443717; Hm_lvt_aecc9715b0f5d5f7f34fba48a3c511d6=1544443985; Hm_lpvt_aecc9715b0f5d5f7f34fba48a3c511d6=1544443985; qHistory=aHR0cDovL3JhbmsuY2hpbmF6LmNvbS9iYWlkdW1vYmlsZS8r55m+5bqm56e75Yqo5p2D6YeNfGh0dHA6Ly9yYW5rLmNoaW5hei5jb20vcmFua2FsbC8r5p2D6YeN57u85ZCI5p+l6K+ifGh0dHA6Ly9yYW5rLmNoaW5hei5jb20r55m+5bqm5p2D6YeN5p+l6K+ifGh0dHA6Ly9pbmRleC5jaGluYXouY29tLyvlhbPplK7or43lhajnvZHmjIfmlbB8aHR0cDovL3JhbmsuY2hpbmF6LmNvbS9yYW5rL2hpc3RvcnkuYXNweCvmnYPph43ljoblj7Lmn6Xor6I=',
        'Host': 'rank.chinaz.com',
        'Origin': 'http://rank.chinaz.com',
        'Referer': 'http://rank.chinaz.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest'
    }
    urls = 'http://rank.chinaz.com/ajaxseo.aspx?t=rankall&on=1&type=undefined&callback=%s' % (
        callback)

    try:
        # The timeout keeps one unresponsive lookup from hanging the whole
        # scan; 20 s matches the timeout scan_seo() uses for its own request.
        r = requests.post(url=urls, headers=headers, data=data, timeout=20)
        match = re.search(r',"br":(\d),"beforBr', r.content)
    except Exception:
        # Best-effort lookup: any network or decoding problem means
        # "no weight" rather than aborting the scan.
        return '无权重'

    if match is None:
        return '无权重'
    return match.group(1)

class IPLocator:
    """
    Reader for an IP location database file (the script uses "qqwry.dat").

    Layout as read by this class:

    * an 8-byte header holding the file offsets of the first and the last
      index entry;
    * 7-byte index entries: a 4-byte start IP followed by a 3-byte offset
      of the matching record;
    * records that start with a 4-byte end IP followed by NUL-terminated
      country/area strings, optionally reached through redirect markers
      (flag byte 0x01 or 0x02 followed by a 3-byte offset).

    All multi-byte integers are decoded as little-endian.  Looked-up
    addresses are transcoded from GBK to UTF-8 byte strings.

    The instance owns an open file; release it with ``close()`` or use the
    instance as a context manager.
    """

    def __init__(self, ipdbFile):
        """Open ``ipdbFile`` and read the index boundaries from its header."""
        self.ipdb = open(ipdbFile, "rb")
        try:
            header = self.ipdb.read(8)
            (self.firstIndex, self.lastIndex) = struct.unpack('<II', header)
        except struct.error:
            # Do not leak the handle when the file is too short to be valid.
            self.ipdb.close()
            raise
        # Floor division keeps the count an integer on every Python version.
        self.indexCount = (self.lastIndex - self.firstIndex) // 7 + 1

    def close(self):
        """Close the underlying database file."""
        self.ipdb.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def getVersion(self):
        """Return the text stored for 255.255.255.0 (the version record)."""
        return self.getIpAddr(0xffffff00)

    def getAreaAddr(self, offset=0):
        """
        Read the area string at ``offset`` (or at the current position when
        ``offset`` is 0), following a 0x01/0x02 redirect if present.
        """
        if offset:
            self.ipdb.seek(offset)
        (flag,) = struct.unpack('B', self.ipdb.read(1))
        if flag == 0x01 or flag == 0x02:
            target = self.getLong3()
            if target:
                return self.getString(target)
            return b""
        # Not a redirect: the byte just read is the first character.
        self.ipdb.seek(-1, 1)
        return self.getString(offset)

    def getAddr(self, offset, ip=0):
        """
        Return "<country> <area>" for the record at ``offset``.

        ``offset`` points at the record's 4-byte end IP; the location data
        starts right after it.  ``ip`` is accepted for compatibility and is
        not used.
        """
        self.ipdb.seek(offset + 4)
        countryAddr = b""
        areaAddr = b""
        (flag,) = struct.unpack('B', self.ipdb.read(1))
        if flag == 0x01:
            # Whole location is redirected to another place in the file.
            countryOffset = self.getLong3()
            self.ipdb.seek(countryOffset)
            (inner,) = struct.unpack('B', self.ipdb.read(1))
            if inner == 0x02:
                # The country itself is redirected once more; the area
                # follows the 4-byte redirect (flag + 3-byte offset).
                countryAddr = self.getString(self.getLong3())
                self.ipdb.seek(countryOffset + 4)
            else:
                countryAddr = self.getString(countryOffset)
            areaAddr = self.getAreaAddr()
        elif flag == 0x02:
            # Only the country is redirected; the area sits after the
            # end IP (4) + flag (1) + offset (3) = 8 bytes.
            countryAddr = self.getString(self.getLong3())
            areaAddr = self.getAreaAddr(offset + 8)
        else:
            countryAddr = self.getString(offset + 4)
            areaAddr = self.getAreaAddr()
        return countryAddr + b" " + areaAddr

    def dump(self, first, last):
        """Print index entries ``first`` .. ``last - 1`` with their address."""
        if last > self.indexCount:
            last = self.indexCount
        for index in range(first, last):
            offset = self.firstIndex + index * 7
            self.ipdb.seek(offset)
            buf = self.ipdb.read(7)
            (ip, of1, of2) = struct.unpack("<IHB", buf)
            address = self.getAddr(of1 + (of2 << 16))
            # Convert GBK to UTF-8.
            address = address.decode('gbk').encode("utf-8")
            print("%d\t%s\t%s" % (index, self.ip2str(ip), address))

    def setIpRange(self, index):
        """
        Load index entry ``index`` into ``curStartIp``, ``curEndIpOffset``
        and ``curEndIp``.
        """
        offset = self.firstIndex + index * 7
        self.ipdb.seek(offset)
        buf = self.ipdb.read(7)
        (self.curStartIp, of1, of2) = struct.unpack("<IHB", buf)
        self.curEndIpOffset = of1 + (of2 << 16)
        self.ipdb.seek(self.curEndIpOffset)
        buf = self.ipdb.read(4)
        (self.curEndIp,) = struct.unpack("<I", buf)

    def getIpAddr(self, ip):
        """
        Return the location text for the integer address ``ip``.

        Binary-searches the index for the entry whose start IP is the
        greatest one not above ``ip``; returns a "not found" message when
        ``ip`` is outside that entry's range.
        """
        L = 0
        R = self.indexCount - 1
        while L < R - 1:
            M = (L + R) // 2
            self.setIpRange(M)
            if ip == self.curStartIp:
                L = M
                break
            if ip > self.curStartIp:
                L = M
            else:
                R = M
        self.setIpRange(L)
        # Version information lives at 255.255.255.x, which the loop above
        # never selects; ugly but useful.
        if (ip & 0xffffff00) == 0xffffff00:
            self.setIpRange(R)
        if self.curStartIp <= ip <= self.curEndIp:
            address = self.getAddr(self.curEndIpOffset)
            # Convert GBK to UTF-8.
            address = address.decode('gbk').encode("utf-8")
        else:
            address = "未找到该IP的地址"
        return address

    def getIpRange(self, ip):
        """Return "<start> - <end>" for the index entry selected for ``ip``."""
        self.getIpAddr(ip)
        span = self.ip2str(self.curStartIp) + ' - ' \
            + self.ip2str(self.curEndIp)
        return span

    def getString(self, offset=0):
        """
        Read a NUL-terminated byte string at ``offset`` (or at the current
        position when ``offset`` is 0).  The file position ends up just
        past the terminator, which the callers rely on.
        """
        if offset:
            self.ipdb.seek(offset)
        chunks = []
        ch = self.ipdb.read(1)
        (byte,) = struct.unpack('B', ch)
        while byte != 0:
            chunks.append(ch)
            ch = self.ipdb.read(1)
            (byte,) = struct.unpack('B', ch)
        return b"".join(chunks)

    def ip2str(self, ip):
        """Format the integer address ``ip`` as a dotted quad."""
        return str(ip >> 24) + '.' + str((ip >> 16) & 0xff) + '.' \
            + str((ip >> 8) & 0xff) + '.' + str(ip & 0xff)

    def str2ip(self, s):
        """Parse the dotted quad ``s`` into an integer (first octet highest)."""
        # inet_aton() yields network-order bytes; unpacking them as
        # big-endian gives the value directly on any host byte order.
        (ip,) = struct.unpack('!I', socket.inet_aton(s))
        return ip

    def getLong3(self, offset=0):
        """
        Read a 3-byte little-endian integer at ``offset`` (or at the
        current position when ``offset`` is 0).
        """
        if offset:
            self.ipdb.seek(offset)
        buf = self.ipdb.read(3)
        (a, b) = struct.unpack('<HB', buf)
        return (b << 16) + a


def get_ip_address(ip):
    """
    Return the location text stored in ``qqwry.dat`` for a dotted-quad IP.

    The database file is opened relative to the current working directory
    on every call and is closed again before returning.

    :param ip: IPv4 address in dotted-quad notation
    :return: whatever ``IPLocator.getIpAddr`` yields for that address
    """
    IPL = IPLocator("qqwry.dat")
    try:
        return IPL.getIpAddr(IPL.str2ip(ip))
    finally:
        # The locator keeps its file handle in ``ipdb``; close it so that
        # repeated lookups do not accumulate open files.
        IPL.ipdb.close()


def get_ipinfomation(lis):
    """
    Describe every port in ``lis`` using the port database.

    Each entry is looked up with ``get_ports``.  A hit produces a
    multi-line block with the port, its service name and its description;
    a miss produces a single "<port>:识别失败" line.

    :param lis: iterable of port numbers
    :return: list of description strings, in input order
    """
    def _describe(port):
        label = str(port)
        record = get_ports(label)
        if record != []:
            return ('\n端口:' + label
                    + '\n服务:' + str(record['name'])
                    + '\n功能:' + str(record['description'])
                    + '\n')
        return label + ':' + str('识别失败') + '\n'

    return [_describe(port) for port in lis]

def get_ips(ip):
    """
    Scan ``ip`` with masscan and describe the open TCP ports.

    Port 80 is always part of the result, and it is the only port reported
    when the scan itself fails.

    :param ip: address handed to ``masscan.PortScanner.scan``
    :return: dict with two string values: ``'开放端口'`` (the port list) and
        ``'端口信息'`` (the list of per-port descriptions).  Both are
        ``str()`` renderings of lists; ``run`` parses ``'端口信息'`` back.
    """
    try:
        mas = masscan.PortScanner()
        mas.scan(ip)
        # list() so the result can be appended to below.
        url_port = list(mas.scan_result['scan'][ip]['tcp'].keys())
    except Exception:
        # Best-effort: a missing masscan binary, a scan error or an empty
        # result all fall back to assuming the web port only.
        url_port = [80]

    if 80 not in url_port:
        url_port.append(80)

    try:
        return {
            '开放端口': str(url_port),
            '端口信息': str(get_ipinfomation(url_port)),
        }
    except Exception as e:
        print(e)
        # Always hand back a dict: the caller merges the result with
        # dict.update(), which would raise on None.
        return {
            '开放端口': str(url_port),
            '端口信息': '["获取失败"]',
        }




def get_info(pattren, result):
    """
    Return the first capture group of ``pattren`` found in ``result``.

    :param pattren: regular expression containing at least one group
    :param result: text to search
    :return: the text of group 1, or ``'暂无信息'`` when the pattern is
        invalid, does not match, has no group, or ``result`` is not
        searchable text
    """
    try:
        match = re.search(pattren, result)
    except (TypeError, re.error):
        # Non-text input or a malformed pattern: report "no information"
        # instead of aborting the scan.
        return '暂无信息'
    if match is None:
        return '暂无信息'
    try:
        return match.group(1)
    except IndexError:
        # Pattern matched but defines no capture group.
        return '暂无信息'

import socket

def iiip(url):
    """
    Resolve the host of ``url`` to an IPv4 address.

    The scheme, any path after the host, an explicit port and ``www.`` are
    removed before the lookup.

    :param url: site address, with or without ``http://`` / ``https://``
    :return: the address returned by ``socket.gethostbyname``, or
        ``'获取失败'`` when the host cannot be extracted or resolved
    """
    try:
        host = url.replace('https://', '').replace('http://', '')
        # Keep only the authority part: dropping every '/' instead would
        # glue the path onto the host name ("a.com/x" -> "a.comx").
        host = host.split('/')[0].split(':')[0].replace('www.', '')
        return socket.gethostbyname(host)
    except Exception:
        # Best-effort lookup; callers test for this sentinel string.
        return '获取失败'

def scan_seo(url):
    """
    Collect site information for ``url`` from seo.chinaz.com.

    Fetches the chinaz SEO page for the site's host, extracts the fields
    with the module-level patterns, and adds the Baidu weight, the IP
    location and the open-port information.

    :param url: site address, with or without scheme
    :return: dict of report fields keyed by their Chinese labels, or
        ``False`` when the chinaz page cannot be fetched
    """
    print('Scan : ' + url)
    headers = {'User-Agent': random.choice(headerss)}
    # Keep only the host: stripping every '/' instead would append the URL
    # path to the host name.
    host = url.replace('https://', '').replace('http://', '').split('/')[0].replace('www.', '')
    urls = 'http://seo.chinaz.com/' + host
    # Used whenever no port information could be obtained.
    port_fallback = {'开放端口': '[80]', '端口信息': '["获取失败"]'}

    res = {}
    try:
        # headers must be passed by keyword: the second positional
        # parameter of requests.get() is ``params``.
        r = requests.get(urls, headers=headers, timeout=20).content
    except Exception as e:
        print(e)
        return False

    res['百度权重'] = str(get_baidu_weights(url))
    res['网站网址'] = url
    res['网站标题'] = get_info(title_parrten, r)

    ip_infos = get_info(ip_parrten, r)
    if '[' in ip_infos:
        # The page shows the address as "<ip>[<location>]".
        ip, address = ip_infos.split('[')[:2]
        res['IP__坐标'] = address.replace(']', '')
        res['所属__IP'] = ip
        res.update(get_ips(ip) or port_fallback)
    else:
        # No IP on the page: resolve it ourselves and use the local
        # location database.
        res['所属__IP'] = iiip(url)
        if res['所属__IP'] == '获取失败':
            res['IP__坐标'] = '获取失败'
            res.update(port_fallback)
        else:
            res['IP__坐标'] = get_ip_address(res['所属__IP'])
            res.update(get_ips(res['所属__IP']) or port_fallback)

    res['网站年龄'] = get_info(ages, r)
    res['备案编号'] = get_info(whois_id, r)
    res['备案性质'] = get_info(whois_type, r)
    res['备案名称'] = get_info(whois_name, r)
    res['备案时间'] = get_info(whois_time, r)
    res['百度收录'] = get_info(include_baidu, r)

    dd = re.findall(infos, r, re.S)
    # Compare by value: ``is ''`` only worked through string interning.
    resu = ['暂无信息' if x.replace(' ', '') == '' else x for x in dd]
    # The four info cells appear on the page in this order.
    for index, key in enumerate(('协议类型', '页面类型', '服务类型', '程序语言')):
        res[key] = resu[index] if index < len(resu) else '获取失败'

    return res

def _report_text(result):
    """Render the dict returned by scan_seo() as the text of a report."""
    import ast

    lines = [
        '                             【 网站信息 】 ',
        '【网站网址】 ' + result['网站网址'],
        '【网站标题】 ' + result['网站标题'],
        '【百度权重】 ' + result['百度权重'],
        '【网站年龄】 ' + result['网站年龄'],
        '【所属__IP】 ' + result['所属__IP'],
        '【IP__坐标】 ' + result['IP__坐标'],
        '【页面类型】 ' + result['页面类型'],
        '【服务类型】 ' + result['服务类型'],
        '【程序语言】 ' + result['程序语言'],
        '【开放端口】 ' + result['开放端口'],
        '【端口信息】 ',
    ]
    # '端口信息' is the str() rendering of a list of strings.  It is parsed
    # with literal_eval rather than eval because the text is derived from
    # scan data and must never be executed as code.
    lines.extend(ast.literal_eval(result['端口信息']))
    lines.extend([
        '【备案编号】 ' + result['备案编号'],
        '【备案性质】 ' + result['备案性质'],
        '【备案名称】 ' + result['备案名称'],
        '【备案时间】 ' + result['备案时间'],
        '【百度收录】 ' + result['百度收录'],
    ])
    return '\n'.join(lines) + '\n'


def _append_report(path, text):
    """Append ``text`` to the report file at ``path``."""
    with open(path, 'a+') as report_file:
        report_file.write(text)


def run(url):
    """
    Scan ``url`` and append its report to a file under ``result/``.

    The file is named after the page title; when that name cannot be used
    (undecodable title, characters the file system rejects) the host part
    of ``url`` is used instead.  Nothing is written when the scan fails.

    :param url: site address to scan
    :return: ``None``
    """
    result = scan_seo(url)
    if not result:
        return None

    # Build the whole text first so a formatting problem can never leave a
    # half-written report behind.
    report = _report_text(result)
    suffix = '__InforMationReport.txt'
    try:
        title = result['网站标题']
        if isinstance(title, bytes):
            title = title.decode('utf-8')
        _append_report('result/' + title + suffix, report)
    except (IOError, OSError, UnicodeError):
        fallback = url.replace('https://', '').replace('http://', '').replace('/', '').replace('www.', '')
        _append_report('result/' + fallback + suffix, report)



if __name__ == '__main__':
    # Reports are written below ./result; create it on first use.
    if not os.path.exists('result'):
        os.mkdir('result')

    print('''
             _                           _
            | |                         (_)
            | |     __ _ _ __   __ _ _____
            | |    / _` | '_ \ / _` |_  / |
            | |___| (_| | | | | (_| |/ /| |
            |______\__,_|_| |_|\__, /___|_|
                                __/ |
                               |___/

    ''')

    time.sleep(1)
    print(unicode('     LangZi 一键信息综合查询', 'utf-8'))
    time.sleep(1)
    # The prompt is encoded as GBK before being passed to raw_input.
    New_start = raw_input(unicode('导入网址文本(可拖拽):', 'utf-8').encode('gbk'))
    # A dragged-in path may arrive wrapped in quotes.
    New_start = New_start.replace('"', '').replace("'", '')
    # Close the input file promptly, drop surrounding whitespace (including
    # '\r' from Windows line endings) and skip blank lines, which would
    # otherwise turn into a bare 'http://' target.
    with open(New_start, 'r') as url_file:
        targets = [line.strip() for line in url_file]
    list_ = list(set(
        [x if x.startswith('http') else 'http://' + x for x in targets if x]))
    for u in list_:
        run(u)
        # Random pause between sites.
        time.sleep(random.randint(1, 5))
#

功能

导入网址

自动生成信息

坚持原创技术分享,您的支持将鼓励我继续创作!
------ 本文结束 ------

版权声明

LangZi_Blog's by Jy Xie is licensed under a Creative Commons BY-NC-ND 4.0 International License
由浪子LangZi创作并维护的Langzi_Blog's博客采用创作共用保留署名-非商业-禁止演绎4.0国际许可证
本文首发于Langzi_Blog's 博客( http://langzi.fun ),版权所有,侵权必究。

0%