Python判断网页编码

有一种渴,只有酒才能滋润,这种渴就是孤独。

根据网页返回编码寻找数据

比如我要找到这个网页的标题,那么直接用正则匹配 `<title>(.*?)</title>` 就可以,但是许多时候因为编码问题requests这个库没办法正确解析,所以获取不到数据。

解决办法:

r_port_top = requests.get(url=str('http://' + url), headers=headers, timeout=5)
# requests falls back to ISO-8859-1 when the server sends no charset header;
# in that case recover the real encoding from the page itself.
if r_port_top.encoding == 'ISO-8859-1':
    encodings = requests.utils.get_encodings_from_content(r_port_top.text)
    # Prefer the encoding declared inside the HTML; otherwise use requests'
    # byte-level guess (apparent_encoding).
    encoding = encodings[0] if encodings else r_port_top.apparent_encoding
    encode_content = r_port_top.content.decode(encoding, 'replace').encode('utf-8', 'replace')
    # group(1) captures the title text directly -- no need to regex-match the
    # whole tag and then strip '<title>'/'</title>' with chained replaces.
    port_title = re.search('<title>(.*?)</title>', encode_content, re.S).group(1)

这种办法就是先判断网页的编码,然后转换之。但是有的时候是utf-8编码就没办法,接下来来个终极版的。

try:
    UA = random.choice(headerss)
    headers = {'User-Agent': UA}
    r_port_top = requests.get(url=str('http://' + url), headers=headers, timeout=5)
    # The original had five byte-identical branches for 'ISO-8859-1',
    # 'GB2312'/'gb2312' and 'GBK'/'gbk'.  One case-insensitive membership
    # test covers them all: these are the encodings requests either defaults
    # to (ISO-8859-1, when no charset header is sent) or that Chinese pages
    # frequently mis-declare, so re-detect from the page content.
    if r_port_top.encoding and r_port_top.encoding.lower() in ('iso-8859-1', 'gb2312', 'gbk'):
        encodings = requests.utils.get_encodings_from_content(r_port_top.text)
        # Prefer the encoding declared in the HTML, else requests' byte guess.
        encoding = encodings[0] if encodings else r_port_top.apparent_encoding
        encode_content = r_port_top.content.decode(encoding, 'replace').encode('utf-8', 'replace')
        # group(1) is the title text itself; no tag-stripping needed.
        port_title = re.search('<title>(.*?)</title>', encode_content, re.S).group(1)
    else:
        port_title = re.search('<title>(.*?)</title>', r_port_top.content, re.S).group(1)
except Exception:
    # The request or the decode/search above failed; try once more on the
    # raw bytes (r_port_top may itself be undefined if the GET failed,
    # hence the inner guard).
    try:
        port_title = re.search('<title>(.*?)</title>', r_port_top.content, re.S).group(1)
    except Exception:
        port_title = '暂时无法获取网站标题'

使用chardet直接判断转换

上面那个方法实在是太傻了,使用chardet轻松解决网页编码问题。

# -*- coding: utf-8 -*-
# @Time    : 2018/5/4 0004 8:55
# @Author  : Langzi
# @Blog    : www.langzi.fun
# @File    : get urls.py
# @Software: PyCharm
# NOTE(review): Python 2 only -- relies on reload()/setdefaultencoding, the
# print statement, and the `unicode` builtin.
import sys
import chardet
import re
import requests

# Python 2 hack: re-expose setdefaultencoding and force UTF-8 as the default
# codec for implicit str<->unicode conversions.
reload(sys)
sys.setdefaultencoding('utf-8')

url = 'https://stackoverflow.com'
d1 = requests.get(url)
print d1.content
# If the body is already unicode there is nothing to decode; otherwise let
# chardet guess the byte encoding and decode with that.
if isinstance(d1.content,unicode):
    pass
else:
    codesty = chardet.detect(d1.content)
    # `a` holds the page decoded with the detected encoding.
    a = d1.content.decode(codesty['encoding'])

得到的 a 就是网页解码后的结果,这个时候直接 re.search('<title>(.*?)</title>', a) 就可以匹配出网页的标题了。

当然更简单的方式

requests自带的一个api可以快速识别网页编码,然后转换成utf-8编码

import requests

url = 'http://www.langzi.fun'
r = requests.get(url)
# get_encodings_from_content returns every encoding declared inside the page;
# the list can be empty (original code indexed [0] unconditionally and would
# raise IndexError), so fall back to requests' byte-level detection.
encodings = requests.utils.get_encodings_from_content(r.text)
encoding = encodings[0] if encodings else r.apparent_encoding
print(encoding)
res = r.content.decode(encoding, 'replace')
# 'replace' substitutes U+FFFD for undecodable bytes, so broken characters
# stay visible at a glance.
res = r.content.decode(encoding, 'ignore')
# 'ignore' silently drops undecodable bytes, keeping only clean text.
# 忽略其中有异常的编码,仅显示有效的编码

通过查看该api的源码,得知它实现的原理是用正则表达式获取到网页中的编码

def get_encodings_from_content(content):
    """Return every character encoding declared inside *content*.

    Looks for, in this order: an HTML ``<meta charset=...>`` attribute,
    the legacy ``http-equiv`` Content-Type pragma, and an XML declaration
    at the start of the document.  All captured encoding names are
    returned as a single list (possibly empty).
    """
    declarations = (
        re.compile(r'<meta.*?charset=["\']*(.+?)["\'>]', flags=re.I),
        re.compile(r'<meta.*?content=["\']*;?charset=(.+?)["\'>]', flags=re.I),
        re.compile(r'^<\?xml.*?encoding=["\']*(.+?)["\'>]'),
    )
    found = []
    for pattern in declarations:
        found.extend(pattern.findall(content))
    return found

知道原理后,就可以把这个函数拿来移植到自己的功能函数中,也是学到了噢~

获取网页信息

使用chardet库来进行编码判断

如果想要获取网页的标题和内容以及网页中的外链,写了一个类来实现。使用方法如下

d = Get_Info(url='http://www.langzi.fun')
d1 = d.get_urls()
# 返回这个传入网址中所有的外链,返回对象为列表,如果没有数据返回None,下同
d2 = d.get_infos()
# 返回这个网址中的标题,内容,返回对象为字典
d3 = d.get_ips()
# 返回这个网址的ip和开放端口,返回对象为字典

具体代码如下:

# coding:utf-8
# NOTE(review): Python 2 module -- reload()/setdefaultencoding and the
# `except Exception, e` syntax used below do not exist in Python 3.

import re
import requests
import time
import socket
from bs4 import BeautifulSoup as bs
import chardet
import sys
# Python 2 hack: force UTF-8 as the default codec for implicit
# str<->unicode conversions.
reload(sys)
sys.setdefaultencoding('utf-8')
# Global socket timeout (seconds) applied to the port probes in
# Get_Info.get_ips.
timeout = 3
socket.setdefaulttimeout(timeout)
from requests.packages import urllib3
# Silence the InsecureRequestWarning emitted by verify=False requests.
urllib3.disable_warnings()
# Well-known service ports probed by Get_Info.get_ips (FTP, SSH, Telnet,
# SMTP, DNS, TFTP, SMB, LDAP, MSSQL, Oracle, ZooKeeper, MySQL, RDP,
# PostgreSQL, CouchDB, Redis, WebLogic, Odoo, Memcached, MongoDB, Hadoop).
ports = [
    21,
    22,
    23,
    25,
    53,
    69,
    139,
    445,
    389,
    1433,
    1521,
    2181,
    3306,
    3389,
    5432,
    5984,
    6379,
    7001,
    7002,
    8069,
    11211,
    27017,
    27018,
    50070,
    50030
]


class Get_Info:
    """Gather basic recon data about a website.

    Example:
        d = Get_Info(url='http://www.langzi.fun')
        d.get_urls()   # external links found in the page, or None
        d.get_infos()  # {'url','title','content','service'}, or None
        d.get_ips()    # {'ip','ports'}, or None
    """

    def __init__(self, url):
        # Full target URL including scheme, e.g. 'http://example.com'.
        self.url = url

    def get_ips(self):
        """Resolve the hostname and TCP-connect-scan the module `ports` list.

        Returns {'ip': str, 'ports': str} (ports rendered as a stringified
        list, 80 always included), or None when DNS resolution fails.
        """
        url_port = [80]  # assume 80 is open: the target is a web site
        hostname = self.url.replace('http://', '').replace('https://', '').replace('/', '')
        try:
            url_ip = socket.gethostbyname(str(hostname))
        except Exception:
            # Original used a bare `except:` and a 'None' sentinel string;
            # an unresolvable host simply means there is nothing to scan.
            return None
        for port in ports:
            s = socket.socket()
            try:
                s.connect((url_ip, port))
                url_port.append(port)
            except Exception:
                pass  # closed or filtered port -- expected, ignore
            finally:
                s.close()
        return {'ip': str(url_ip), 'ports': str(url_port)}

    def get_infos(self):
        """Fetch the page and return a dict with keys 'url', 'title',
        'content' (punctuation/digits/ascii letters stripped) and
        'service' (response headers).  Prints the error and returns None
        when the request itself fails.
        """
        try:
            headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
            r = requests.get(url=self.url, headers=headers, verify=False, timeout=5)
            url_title, url_content, url_service = '获取失败', '获取失败', '获取失败'
            # Fallback codec: the original referenced `code` in the except
            # branch below, which raised NameError whenever chardet.detect
            # itself was what failed.
            code = 'utf-8'
            try:
                code = chardet.detect(r.content)['encoding']
                bp = bs(r.content.decode(code).encode('utf-8'), 'html.parser')
                url_title = bp.title.string
                url_content = bp.text
                url_service = r.headers
            except Exception:
                # Parsing/decoding failed: grab the title by regex instead.
                url_title = re.search('<title>(.*?)</title>', r.content, re.I).group(1).decode(code).encode('utf-8')
                url_content = re.sub('([\.\?\*~!@#{$%\^&\*()-;"<>\[\]}_\+=]|[0-9]|[a-z]|[A-Z])', '', r.text)
                url_service = r.headers
            infos = {}
            infos['url'] = r.url
            infos['title'] = url_title
            url_contents = ''.join(r.text.split()).replace(' ', '')
            infos['content'] = re.sub('([\.\?\*~!@#{$%\^&\*()-;"<>\[\]}_\+=]|[0-9]|[a-z]|[A-Z])', '', url_contents).replace('|', '').replace("'", '')
            infos['service'] = url_service
            # infos always holds four keys here, so the original's
            # `if infos: ... else: return None` else-branch was unreachable.
            return infos
        except Exception as e:
            print(e)

    def get_urls(self):
        """Return a deduplicated list of scheme://host links found in the
        page body, or None when none are found or the request fails."""
        urlss = []
        headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
        try:
            r = requests.get(url=self.url, headers=headers, verify=False, timeout=5)
            pattern = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', re.I)
            urls = re.findall(pattern, r.content)
            for x in urls:
                # Keep only scheme + host, dropping the path and any quoting
                # characters the regex may have swept up.
                a1, a2 = x.split('//')[0], x.split('//')[1].split('/')[0]
                a3 = ''.join(a1) + '//' + ''.join(a2)
                urlss.append(a3.replace("'", "").replace('>', '').replace('<', ''))
            if urlss:
                return list(set(urlss))
            else:
                return None
        except Exception as e:
            print(e)
            pass
坚持原创技术分享,您的支持将鼓励我继续创作!
------ 本文结束 ------

版权声明

LangZi_Blog's by Jy Xie is licensed under a Creative Commons BY-NC-ND 4.0 International License
由浪子LangZi创作并维护的Langzi_Blog's博客采用创作共用保留署名-非商业-禁止演绎4.0国际许可证
本文首发于Langzi_Blog's 博客( http://langzi.fun ),版权所有,侵权必究。

0%