企业项目管理、OKR、研发管理与敏捷开发工具平台

网站首页 > 精选文章 正文

python原始套接字socket下载https网页文件到txt

wudianyun 2025-05-08 21:39:18 精选文章 13 ℃

使用 Python 标准库的 socket 与 ssl 模块（基于 TCP 流套接字，而非 SOCK_RAW 原始套接字）下载 HTTPS 网页，并将响应正文保存到 txt 文件

import socket
import ssl
from urllib.parse import urlsplit


def _parse_https_url(url):
    """Split *url* into ``(host, port, request_target)`` for an HTTPS GET.

    A URL written without a scheme (``www.example.com/page``) is treated as
    HTTPS.  The fragment is dropped because it is never sent to a server;
    the query string is kept as part of the request target.

    Raises:
        ValueError: if the scheme is not ``https``, the host is missing, or
            the port is not a valid number.
    """
    # Only treat the text before "://" as a scheme when it cannot be part of
    # a path or query (e.g. "example.com/go?u=http://x" has no scheme).
    head, separator, _ = url.partition("://")
    if not separator or any(ch in head for ch in "/?#"):
        url = "https://" + url

    parts = urlsplit(url)
    if parts.scheme != "https":
        raise ValueError(f"unsupported URL scheme: {parts.scheme!r}")
    host = parts.hostname
    if not host:
        raise ValueError(f"no host in URL: {url!r}")
    port = parts.port or 443

    target = parts.path or "/"
    if parts.query:
        target += "?" + parts.query
    return host, port, target


def _is_chunked(header_block):
    """Return True if the raw header block declares a chunked body."""
    # The first line is the status line, so header fields start at index 1.
    for line in header_block.split(b"\r\n")[1:]:
        name, separator, value = line.partition(b":")
        if (
            separator
            and name.strip().lower() == b"transfer-encoding"
            and b"chunked" in value.lower()
        ):
            return True
    return False


def _decode_chunked(body):
    """Reassemble a ``Transfer-Encoding: chunked`` body into plain bytes.

    Raises:
        ValueError: if a chunk-size line is missing or malformed, or a chunk
            is shorter than its declared size.
    """
    pieces = []
    position = 0
    while True:
        line_end = body.find(b"\r\n", position)
        if line_end == -1:
            raise ValueError("truncated chunked body: missing chunk-size line")
        # Chunk extensions (";name=value") may follow the size; ignore them.
        size_field = body[position:line_end].split(b";", 1)[0].strip()
        size = int(size_field, 16)
        if size == 0:
            # The zero-size chunk ends the body; any trailers are discarded.
            break
        start = line_end + 2
        end = start + size
        if end > len(body):
            raise ValueError("truncated chunked body: chunk shorter than declared")
        pieces.append(body[start:end])
        position = end + 2  # skip the CRLF that terminates the chunk data
    return b"".join(pieces)


def download_https_webpage(
    url: str, output_file: str, *, timeout: float = 30.0
) -> None:
    """Fetch *url* over TLS with a hand-built HTTP GET and save the body.

    The request is sent as HTTP/1.1 with ``Connection: close`` and the whole
    response is read until the server closes the connection.  The header
    block is discarded; the body (de-chunked when the server used chunked
    transfer encoding) is written to *output_file* in binary mode.  The body
    is saved regardless of the HTTP status code.

    Args:
        url: HTTPS URL to download.  The ``https://`` prefix is optional and
            an explicit port (``host:8443``) is honoured.
        output_file: Path of the file that receives the response body.
        timeout: Seconds to wait on connect and on each socket operation.

    Returns:
        None.  Success or failure is reported by printing a message; no
        exception propagates to the caller.
    """
    try:
        host, port, target = _parse_https_url(url)

        # A non-default port must appear in the Host header, and an IPv6
        # literal has to be bracketed there.
        authority = f"[{host}]" if ":" in host else host
        if port != 443:
            authority = f"{authority}:{port}"
        request = (
            f"GET {target} HTTP/1.1\r\n"
            f"Host: {authority}\r\n"
            "Connection: close\r\n"
            "\r\n"
        )

        context = ssl.create_default_context()
        received = []
        # create_connection resolves the name itself (IPv4 and IPv6), and the
        # context managers close both sockets even when an error occurs.
        with socket.create_connection((host, port), timeout=timeout) as raw_sock:
            with context.wrap_socket(raw_sock, server_hostname=host) as tls_sock:
                tls_sock.sendall(request.encode())
                while True:
                    data = tls_sock.recv(4096)
                    if not data:
                        break
                    received.append(data)
        response = b"".join(received)

        # Separate the header block from the body.
        header, separator, content = response.partition(b"\r\n\r\n")
        if not separator:
            raise ValueError("malformed HTTP response: header terminator not found")
        if _is_chunked(header):
            # Without this the chunk-size lines would end up in the file.
            content = _decode_chunked(content)

        with open(output_file, "wb") as file:
            file.write(content)

        print(f"网页内容已成功保存到 {output_file}")
    except Exception as e:
        # This function is the script's top-level boundary: report, don't raise.
        print(f"发生错误: {e}")


if __name__ == "__main__":
    # Demo run: fetch the sample site and store the response body locally.
    # Replace the URL (and the output name) with the page you want to download.
    output_file = "www5a8com.txt"
    url = "https://www.5a8.com"
    download_https_webpage(url=url, output_file=output_file)
    

运行结果

D:\code\python\get>python getsocketssl.py
网页内容已成功保存到 www5a8com.txt
最近发表
标签列表