Skip to content
Merged

Dev #17

Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
962b00a
Build a simple anonymous rsync server
jumploop Jul 31, 2023
a6ba4c9
Build a simple anonymous rsync server
jumploop Jul 31, 2023
0bff58b
modify rsync shell
jumploop Aug 2, 2023
6c8d1f5
lottery procedure
jumploop Aug 8, 2023
60ae3d7
解密字符串
jumploop Aug 22, 2023
191bec8
check url
jumploop Aug 29, 2023
33d6abc
word sort
jumploop Sep 7, 2023
498f293
File change monitoring
jumploop Sep 17, 2023
c91f662
modify watching
jumploop Sep 17, 2023
9edd3fe
pyinotify simple demo
jumploop Sep 17, 2023
0daeabf
modify watching.py
jumploop Sep 17, 2023
08a8055
add new file
jumploop Sep 17, 2023
42fb8e7
modify simple.py
jumploop Sep 17, 2023
4041f40
update watching.py
jumploop Sep 18, 2023
e9f8070
test execute time
jumploop Sep 23, 2023
ad7c57d
Caching results
jumploop Sep 23, 2023
c3ef8db
valid data
jumploop Sep 24, 2023
60a4317
log results
jumploop Sep 24, 2023
eecce5b
suppress errors
jumploop Sep 24, 2023
57e885d
validate output
jumploop Sep 24, 2023
f876da6
Retry execution
jumploop Sep 28, 2023
2ca65c1
visualize results
jumploop Sep 28, 2023
5f33e55
visualize results
jumploop Sep 28, 2023
2791351
debug
jumploop Sep 28, 2023
b929bf7
deprecated function
jumploop Sep 28, 2023
05a56b6
bin to dump
jumploop Oct 14, 2023
f6bef99
output square
jumploop Oct 17, 2023
e10ad65
output square
jumploop Oct 17, 2023
4480cff
output trapezoid
jumploop Oct 17, 2023
09dcb26
output trapezoid
jumploop Oct 17, 2023
f007f3a
add .sh suffix
jumploop Oct 19, 2023
6024631
add .sh suffix
jumploop Oct 19, 2023
516cb27
Isosceles trapezoid
jumploop Oct 19, 2023
7c9c271
Merge branch 'PegasusWang:master' into dev
jumploop Oct 26, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions bigdata/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
*.txt
161 changes: 161 additions & 0 deletions bigdata/handlefile.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
# -*- coding: utf-8 -*-

"""
大数据处理,用来生成文件,分解文件等做测试。

1. 分解文件有一个库:

https://github.com/ram-jayapalan/filesplit

https://www.bswen.com/2018/04/python-How-to-generate-random-large-file-using-python.html


2. linux 文件命令 split 拆分文件

"""
import os
import itertools
import time
import random
import string


def gen_bin(filename='large_file.txt', chunk_size=1024, size=5):
    """Write ``size`` chunks of ``chunk_size`` random bytes to ``filename``.

    The original hard-coded the output name and wrote ``size + 1`` chunks
    (an off-by-one against the "desired size" intent); both are fixed, and
    the chunk size / chunk count are now parameters so a real multi-GB file
    can be produced by passing e.g. ``chunk_size=1024**3, size=50``.

    :param filename: path of the file to create (overwritten if it exists)
    :param chunk_size: bytes written per chunk
    :param size: number of chunks to write; total = size * chunk_size bytes
    """
    with open(filename, 'wb') as fout:
        for _ in range(size):
            fout.write(os.urandom(chunk_size))


def gen_random_ascii(filename="bla.txt", size=10**7):
    """Write ``size`` random lowercase ASCII letters to ``filename`` and print timing.

    Bug fix: the file was opened in binary mode (``'wb'``) but received a
    ``str``, which raises ``TypeError`` on Python 3.  It is now opened in
    text mode and closed deterministically via a context manager.  The
    letter count is also a parameter (default keeps the original 10**7).

    :param filename: path of the file to create
    :param size: number of random letters to write
    """
    t0 = time.time()
    with open(filename, "w") as fout:
        fout.write(''.join(random.choice(
            string.ascii_lowercase) for _ in range(size)))
    d = time.time() - t0
    print("duration: %.2f s." % d)


def generate_big_random_bin_file(filename, size):
    """
    Generate a binary file of exactly ``size`` random bytes.

    Cleanup: removed the function-local ``import os`` (the module already
    imports ``os`` at the top, and the local import shadowed it), the
    pointless ``'%s' % filename`` wrapper, and the trailing ``pass``.

    :param filename: the filename
    :param size: the size in bytes
    :return: void
    """
    with open(filename, 'wb') as fout:
        fout.write(os.urandom(size))

    print('big random binary file with size %f generated ok' % size)


def generate_big_random_letters(filename, size):
    """
    Fill ``filename`` with ``size`` randomly chosen ASCII letters.

    :param filename: the filename
    :param size: the size in bytes (one letter == one byte)
    :return: void
    """
    payload = ''.join([random.choice(string.ascii_letters)
                       for _ in range(size)])

    with open(filename, 'w') as out:
        out.write(payload)


def generate_big_sparse_file(filename, size):
    """Create a sparse file of ``size`` bytes by seeking and writing one byte.

    On filesystems that support sparse files, only the final byte consumes
    disk space; the file's apparent size is still ``size``.

    Bug fix: the file is opened in binary mode, so the written value must be
    ``bytes`` — the original wrote the str ``"\\1"``, which raises
    ``TypeError`` on Python 3.  Also replaced manual open/close with a
    context manager and dropped the stray ``pass``.

    :param filename: path of the file to create
    :param size: apparent size of the file in bytes (must be >= 1)
    """
    with open(filename, "wb") as f:
        f.seek(size - 1)
        f.write(b"\1")


def generate_big_random_sentences(filename, linecount):
    """Write ``linecount`` random four-word sentences to ``filename``.

    Each line is "<noun> <verb> <adjective> <adverb>." built by picking one
    word from each pool in that fixed order.

    Cleanup: the original bound the word-pool list to ``all`` (shadowing the
    builtin) and reused ``i`` as both the line counter and the inner
    comprehension variable; both are renamed, behavior is unchanged.

    :param filename: path of the file to create
    :param linecount: number of sentence lines to write
    """
    nouns = ("puppy", "car", "rabbit", "girl", "monkey")
    verbs = ("runs", "hits", "jumps", "drives", "barfs")
    adv = ("crazily.", "dutifully.", "foolishly.", "merrily.", "occasionally.")
    adj = ("adorable", "clueless", "dirty", "odd", "stupid")

    word_pools = [nouns, verbs, adj, adv]

    with open(filename, 'w') as f:
        for _ in range(linecount):
            f.writelines([' '.join([random.choice(pool) for pool in word_pools]), '\n'])


def generate_big_incr_digits(filename, start, step, size):
    """Write an arithmetic sequence to ``filename``, one integer per line.

    The file contains ``size`` numbers: start, start+step, start+2*step, ...
    (no trailing newline).

    :param filename: path of the file to create
    :param start: first value of the sequence
    :param step: increment between consecutive values
    :param size: how many numbers to write
    """
    counter = itertools.count(start, step)
    body = '\n'.join(str(next(counter)) for _ in range(size))

    with open(filename, 'w') as out:
        out.write(body)


def generate_big_random_digits(filename, start, end, size):
    """Write ``size`` random integers from [start, end] to ``filename``.

    One integer per line, no trailing newline.

    :param filename: path of the file to create
    :param start: inclusive lower bound for each random integer
    :param end: inclusive upper bound for each random integer
    :param size: how many numbers to write
    """
    body = '\n'.join(
        str(random.randint(start, end)) for _ in range(size)
    )

    with open(filename, 'w') as out:
        out.write(body)


def split(filename, lines_per_file=100000):
    """Split ``filename`` into chunks of ``lines_per_file`` lines each.

    Chunks are written to output0.txt, output1.txt, ... in the current
    directory, as bytes (lines re-encoded), preserving line content exactly.

    Improvements over the original: the chunk size is a parameter (default
    keeps the old hard-coded 100000), and the part number uses integer
    division instead of relying on ``"%d"`` truncating the float produced
    by ``i / NUM_OF_LINES + 1`` — same filenames, explicit arithmetic.

    :param filename: path of the text file to split
    :param lines_per_file: max number of lines per output chunk
    """
    with open(filename) as fin:
        fout = open("output0.txt", "wb")
        for i, line in enumerate(fin):
            fout.write(line.encode())
            if (i + 1) % lines_per_file == 0:
                fout.close()
                # Part index of the NEXT chunk: lines 0..N-1 went to part 0, etc.
                fout = open("output%d.txt" % ((i + 1) // lines_per_file), "wb")

    fout.close()


"""
# TODO 如何合并大文件

1. cat 命令合并:
cat file1 file2 file3 > bigfile
cat file1 file2 file3 | sqlite database

2. 使用 python 块读取然后追加写入

https://stackoverflow.com/questions/5509872/python-append-multiple-files-in-given-order-to-one-big-file

def append_file_to_file(_from, _to):
block_size = 1024*1024
with open(_to, "ab") as outfile, open(_from, "rb") as infile:
while True:
input_block = infile.read(block_size)
if not input_block:
break
outfile.write(input_block)
# Given this building block, you can use:

for filename in ['a.bin','b.bin','c.bin']:
append_file_to_file(filename, 'outfile.bin')

3. use dask. https://rcpedia.stanford.edu/topicGuides/merging_data_sets_dask.html
"""


def main():
    """Generate a large file of incrementing digits, then split it.

    Demonstrates the module's generate-and-split workflow used for
    big-data processing exercises.
    """
    target = "nums.txt"
    # Alternative generator with random digits:
    # generate_big_random_digits(target, 1, 1000, 1024 * 1024)
    generate_big_incr_digits(target, 1, 1, 1024 * 1024)
    # Split the big file into fixed-size chunks for downstream processing.
    split(target)


if __name__ == "__main__":
main()
3 changes: 3 additions & 0 deletions calibre/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
*.mobi
*.pdf
*.epub
57 changes: 57 additions & 0 deletions calibre/10x程序员工作法.recipe
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
#!/usr/bin/python
# encoding: utf-8

from calibre.web.feeds.recipes import BasicNewsRecipe # 引入 Recipe 基础类

"""
教程:
- https://bookfere.com/tools#calibre
- https://www.jianshu.com/p/0bcb92509309
- https://snowdreams1006.github.io/myGitbook/advance/export.html

命令:
ebook-convert 10x程序员工作法.recipe 10x程序员工作法.epub
"""


class Blog(BasicNewsRecipe):
    """Calibre recipe that scrapes the 10x程序员工作法 column into an e-book."""

    # --- e-book metadata ---
    title = "10x程序员工作法"          # book title
    description = u"10x程序员工作法"   # short description
    # cover_url = ''                   # book cover image
    # masthead_url = ''                # masthead image
    __author__ = "web"                 # author
    language = "zh"                    # language
    encoding = "utf-8"                 # source encoding

    # --- fetching / cleanup settings ---
    # keep_only_tags = [{ 'class': 'example' }]  # keep only matched content
    no_stylesheets = True              # strip CSS
    remove_javascript = True           # strip JavaScript
    auto_cleanup = True                # auto-clean HTML
    delay = 10                         # seconds between page fetches
    max_articles_per_feed = 300        # max articles to grab
    timeout = 120.0
    simultaneous_downloads = 2         # parallel downloads sometimes fail; keep low

    def parse_index(self):
        """Build the article list Calibre converts into the book."""
        site="https://learn.lianglianglee.com/%E4%B8%93%E6%A0%8F/10x%E7%A8%8B%E5%BA%8F%E5%91%98%E5%B7%A5%E4%BD%9C%E6%B3%95"
        soup = self.index_to_soup(site)
        post = soup.findAll("div", {"class": "book-post"})[0]
        base = "http://learn.lianglianglee.com"
        articles = []
        for item in post.findAll("li"):
            name = item.a.contents[0].strip()
            href = base + '/' + item.a.get("href").lstrip('/')
            print(href)  # progress log while indexing
            articles.append({'title': name, 'url': href})
        return [(self.title, articles)]
56 changes: 56 additions & 0 deletions calibre/22讲通关Go语言.recipe
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
#!/usr/bin/python
# encoding: utf-8

from calibre.web.feeds.recipes import BasicNewsRecipe # 引入 Recipe 基础类

"""
教程:
- https://bookfere.com/tools#calibre
- https://www.jianshu.com/p/0bcb92509309
- https://snowdreams1006.github.io/myGitbook/advance/export.html

命令:
ebook-convert 22讲通关Go语言.recipe 22讲通关Go语言.epub
"""


class Blog(BasicNewsRecipe):
    """Calibre recipe that scrapes the 22讲通关Go语言 column into an e-book."""

    # --- e-book metadata ---
    title = "22讲通关Go语言"           # book title
    description = u"22讲通关Go语言"    # short description
    # cover_url = ''                   # book cover image
    # masthead_url = ''                # masthead image
    __author__ = "web"                 # author
    language = "zh"                    # language
    encoding = "utf-8"                 # source encoding

    # --- fetching / cleanup settings ---
    # keep_only_tags = [{ 'class': 'example' }]  # keep only matched content
    no_stylesheets = True              # strip CSS
    remove_javascript = True           # strip JavaScript
    auto_cleanup = True                # auto-clean HTML
    # delay = 2                        # seconds between page fetches
    max_articles_per_feed = 300        # max articles to grab
    timeout = 120.0
    simultaneous_downloads = 2         # parallel downloads sometimes fail; keep low

    def parse_index(self):
        """Build the article list Calibre converts into the book."""
        site = "https://learn.lianglianglee.com/%E4%B8%93%E6%A0%8F/22%20%E8%AE%B2%E9%80%9A%E5%85%B3%20Go%20%E8%AF%AD%E8%A8%80-%E5%AE%8C"
        soup = self.index_to_soup(site)
        post = soup.findAll("div", {"class": "book-post"})[0]
        base = "http://learn.lianglianglee.com"
        articles = []
        for item in post.findAll("li"):
            name = item.a.contents[0].strip()
            href = base + '/' + item.a.get("href").lstrip('/')
            articles.append({'title': name, 'url': href})
        return [(self.title, articles)]
56 changes: 56 additions & 0 deletions calibre/300分钟吃透分布式缓存.recipe
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
#!/usr/bin/python
# encoding: utf-8

from calibre.web.feeds.recipes import BasicNewsRecipe # 引入 Recipe 基础类

"""
教程:
- https://bookfere.com/tools#calibre
- https://www.jianshu.com/p/0bcb92509309
- https://snowdreams1006.github.io/myGitbook/advance/export.html

命令:
ebook-convert 300分钟吃透分布式缓存.recipe 300分钟吃透分布式缓存.epub
"""


class Blog(BasicNewsRecipe):
    """Calibre recipe that scrapes the 300分钟吃透分布式缓存 column into an e-book."""

    # --- e-book metadata ---
    title = "300分钟吃透分布式缓存"      # book title
    description = u"300分钟吃透分布式缓存"  # short description
    # cover_url = ''                   # book cover image
    # masthead_url = ''                # masthead image
    __author__ = "web"                 # author
    language = "zh"                    # language
    encoding = "utf-8"                 # source encoding

    # --- fetching / cleanup settings ---
    # keep_only_tags = [{ 'class': 'example' }]  # keep only matched content
    no_stylesheets = True              # strip CSS
    remove_javascript = True           # strip JavaScript
    auto_cleanup = True                # auto-clean HTML
    # delay = 2                        # seconds between page fetches
    max_articles_per_feed = 300        # max articles to grab
    timeout = 120.0
    simultaneous_downloads = 2         # parallel downloads sometimes fail; keep low

    def parse_index(self):
        """Build the article list Calibre converts into the book."""
        site= "https://learn.lianglianglee.com/%E4%B8%93%E6%A0%8F/300%E5%88%86%E9%92%9F%E5%90%83%E9%80%8F%E5%88%86%E5%B8%83%E5%BC%8F%E7%BC%93%E5%AD%98-%E5%AE%8C"
        soup = self.index_to_soup(site)
        post = soup.findAll("div", {"class": "book-post"})[0]
        base = "http://learn.lianglianglee.com"
        articles = []
        for item in post.findAll("li"):
            name = item.a.contents[0].strip()
            href = base + '/' + item.a.get("href").lstrip('/')
            articles.append({'title': name, 'url': href})
        return [(self.title, articles)]
56 changes: 56 additions & 0 deletions calibre/DDD实战课.recipe
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
#!/usr/bin/python
# encoding: utf-8

from calibre.web.feeds.recipes import BasicNewsRecipe # 引入 Recipe 基础类

"""
教程:
- https://bookfere.com/tools#calibre
- https://www.jianshu.com/p/0bcb92509309
- https://snowdreams1006.github.io/myGitbook/advance/export.html

命令:
ebook-convert DDD实战课.recipe DDD实战课.epub
"""


class Blog(BasicNewsRecipe):
    """Calibre recipe that scrapes the DDD实战课 column into an e-book."""

    # --- e-book metadata ---
    title = "DDD实战课"                # book title
    description = u"DDD实战课"         # short description
    # cover_url = ''                   # book cover image
    # masthead_url = ''                # masthead image
    __author__ = "web"                 # author
    language = "zh"                    # language
    encoding = "utf-8"                 # source encoding

    # --- fetching / cleanup settings ---
    # keep_only_tags = [{ 'class': 'example' }]  # keep only matched content
    no_stylesheets = True              # strip CSS
    remove_javascript = True           # strip JavaScript
    auto_cleanup = True                # auto-clean HTML
    # delay = 2                        # seconds between page fetches
    max_articles_per_feed = 300        # max articles to grab
    timeout = 120.0
    simultaneous_downloads = 2         # parallel downloads sometimes fail; keep low

    def parse_index(self):
        """Build the article list Calibre converts into the book."""
        site = "https://learn.lianglianglee.com/%E4%B8%93%E6%A0%8F/DDD%E5%AE%9E%E6%88%98%E8%AF%BE"
        soup = self.index_to_soup(site)
        post = soup.findAll("div", {"class": "book-post"})[0]
        base = "http://learn.lianglianglee.com"
        articles = []
        for item in post.findAll("li"):
            name = item.a.contents[0].strip()
            href = base + '/' + item.a.get("href").lstrip('/')
            articles.append({'title': name, 'url': href})
        return [(self.title, articles)]
Loading