Skip to content
Merged

Dev #17

Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
962b00a
Build a simple anonymous rsync server
jumploop Jul 31, 2023
a6ba4c9
Build a simple anonymous rsync server
jumploop Jul 31, 2023
0bff58b
modify rsync shell
jumploop Aug 2, 2023
6c8d1f5
lottery procedure
jumploop Aug 8, 2023
60ae3d7
解密字符串
jumploop Aug 22, 2023
191bec8
check url
jumploop Aug 29, 2023
33d6abc
word sort
jumploop Sep 7, 2023
498f293
File change monitoring
jumploop Sep 17, 2023
c91f662
modify watching
jumploop Sep 17, 2023
9edd3fe
pyinotify simple demo
jumploop Sep 17, 2023
0daeabf
modify watching.py
jumploop Sep 17, 2023
08a8055
add new file
jumploop Sep 17, 2023
42fb8e7
modify simple.py
jumploop Sep 17, 2023
4041f40
update watching.py
jumploop Sep 18, 2023
e9f8070
test execute time
jumploop Sep 23, 2023
ad7c57d
Caching results
jumploop Sep 23, 2023
c3ef8db
valid data
jumploop Sep 24, 2023
60a4317
log results
jumploop Sep 24, 2023
eecce5b
suppress errors
jumploop Sep 24, 2023
57e885d
validate output
jumploop Sep 24, 2023
f876da6
Retry execution
jumploop Sep 28, 2023
2ca65c1
visualize results
jumploop Sep 28, 2023
5f33e55
visualize results
jumploop Sep 28, 2023
2791351
debug
jumploop Sep 28, 2023
b929bf7
deprecated function
jumploop Sep 28, 2023
05a56b6
bin to dump
jumploop Oct 14, 2023
f6bef99
output square
jumploop Oct 17, 2023
e10ad65
output square
jumploop Oct 17, 2023
4480cff
output trapezoid
jumploop Oct 17, 2023
09dcb26
output trapezoid
jumploop Oct 17, 2023
f007f3a
add .sh suffix
jumploop Oct 19, 2023
6024631
add .sh suffix
jumploop Oct 19, 2023
516cb27
Isosceles trapezoid
jumploop Oct 19, 2023
7c9c271
Merge branch 'PegasusWang:master' into dev
jumploop Oct 26, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions bigdata/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
*.txt
161 changes: 161 additions & 0 deletions bigdata/handlefile.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
# -*- coding: utf-8 -*-

"""
大数据处理,用来生成文件,分解文件等做测试。

1. 分解文件有一个库:

https://github.com/ram-jayapalan/filesplit

https://www.bswen.com/2018/04/python-How-to-generate-random-large-file-using-python.html


2. linux 文件命令 split 拆分文件

"""
import os
import itertools
import time
import random
import string


def gen_bin(filename='large_file.txt', chunk_size=1024, size=5):
    """Write ``size`` chunks of ``chunk_size`` random bytes to ``filename``.

    The original hard-coded the output name and wrote ``size + 1`` chunks
    (an off-by-one against the "desired size" intent); both are fixed, and
    the chunk size / chunk count are now parameters so a real multi-GB file
    can be produced by passing e.g. ``chunk_size=1024**3, size=50``.

    :param filename: path of the file to create (overwritten if it exists)
    :param chunk_size: bytes written per chunk
    :param size: number of chunks to write; total = size * chunk_size bytes
    """
    with open(filename, 'wb') as fout:
        for _ in range(size):
            fout.write(os.urandom(chunk_size))


def gen_random_ascii(filename="bla.txt", size=10**7):
    """Write ``size`` random lowercase ASCII letters to ``filename`` and print timing.

    Bug fix: the file was opened in binary mode (``'wb'``) but received a
    ``str``, which raises ``TypeError`` on Python 3.  It is now opened in
    text mode and closed deterministically via a context manager.  The
    letter count is also a parameter (default keeps the original 10**7).

    :param filename: path of the file to create
    :param size: number of random letters to write
    """
    t0 = time.time()
    with open(filename, "w") as fout:
        fout.write(''.join(random.choice(
            string.ascii_lowercase) for _ in range(size)))
    d = time.time() - t0
    print("duration: %.2f s." % d)


def generate_big_random_bin_file(filename, size):
    """
    Generate a binary file of exactly ``size`` random bytes.

    Cleanup: removed the function-local ``import os`` (the module already
    imports ``os`` at the top, and the local import shadowed it), the
    pointless ``'%s' % filename`` wrapper, and the trailing ``pass``.

    :param filename: the filename
    :param size: the size in bytes
    :return: void
    """
    with open(filename, 'wb') as fout:
        fout.write(os.urandom(size))

    print('big random binary file with size %f generated ok' % size)


def generate_big_random_letters(filename, size):
    """
    Fill ``filename`` with ``size`` randomly chosen ASCII letters.

    :param filename: the filename
    :param size: the size in bytes (one letter == one byte)
    :return: void
    """
    payload = ''.join([random.choice(string.ascii_letters)
                       for _ in range(size)])

    with open(filename, 'w') as out:
        out.write(payload)


def generate_big_sparse_file(filename, size):
    """Create a sparse file of ``size`` bytes by seeking and writing one byte.

    On filesystems that support sparse files, only the final byte consumes
    disk space; the file's apparent size is still ``size``.

    Bug fix: the file is opened in binary mode, so the written value must be
    ``bytes`` — the original wrote the str ``"\\1"``, which raises
    ``TypeError`` on Python 3.  Also replaced manual open/close with a
    context manager and dropped the stray ``pass``.

    :param filename: path of the file to create
    :param size: apparent size of the file in bytes (must be >= 1)
    """
    with open(filename, "wb") as f:
        f.seek(size - 1)
        f.write(b"\1")


def generate_big_random_sentences(filename, linecount):
    """Write ``linecount`` random four-word sentences to ``filename``.

    Each line is "<noun> <verb> <adjective> <adverb>." built by picking one
    word from each pool in that fixed order.

    Cleanup: the original bound the word-pool list to ``all`` (shadowing the
    builtin) and reused ``i`` as both the line counter and the inner
    comprehension variable; both are renamed, behavior is unchanged.

    :param filename: path of the file to create
    :param linecount: number of sentence lines to write
    """
    nouns = ("puppy", "car", "rabbit", "girl", "monkey")
    verbs = ("runs", "hits", "jumps", "drives", "barfs")
    adv = ("crazily.", "dutifully.", "foolishly.", "merrily.", "occasionally.")
    adj = ("adorable", "clueless", "dirty", "odd", "stupid")

    word_pools = [nouns, verbs, adj, adv]

    with open(filename, 'w') as f:
        for _ in range(linecount):
            f.writelines([' '.join([random.choice(pool) for pool in word_pools]), '\n'])


def generate_big_incr_digits(filename, start, step, size):
    """Write an arithmetic sequence to ``filename``, one integer per line.

    The file contains ``size`` numbers: start, start+step, start+2*step, ...
    (no trailing newline).

    :param filename: path of the file to create
    :param start: first value of the sequence
    :param step: increment between consecutive values
    :param size: how many numbers to write
    """
    counter = itertools.count(start, step)
    body = '\n'.join(str(next(counter)) for _ in range(size))

    with open(filename, 'w') as out:
        out.write(body)


def generate_big_random_digits(filename, start, end, size):
    """Write ``size`` random integers from [start, end] to ``filename``.

    One integer per line, no trailing newline.

    :param filename: path of the file to create
    :param start: inclusive lower bound for each random integer
    :param end: inclusive upper bound for each random integer
    :param size: how many numbers to write
    """
    body = '\n'.join(
        str(random.randint(start, end)) for _ in range(size)
    )

    with open(filename, 'w') as out:
        out.write(body)


def split(filename, lines_per_file=100000):
    """Split ``filename`` into chunks of ``lines_per_file`` lines each.

    Chunks are written to output0.txt, output1.txt, ... in the current
    directory, as bytes (lines re-encoded), preserving line content exactly.

    Improvements over the original: the chunk size is a parameter (default
    keeps the old hard-coded 100000), and the part number uses integer
    division instead of relying on ``"%d"`` truncating the float produced
    by ``i / NUM_OF_LINES + 1`` — same filenames, explicit arithmetic.

    :param filename: path of the text file to split
    :param lines_per_file: max number of lines per output chunk
    """
    with open(filename) as fin:
        fout = open("output0.txt", "wb")
        for i, line in enumerate(fin):
            fout.write(line.encode())
            if (i + 1) % lines_per_file == 0:
                fout.close()
                # Part index of the NEXT chunk: lines 0..N-1 went to part 0, etc.
                fout = open("output%d.txt" % ((i + 1) // lines_per_file), "wb")

    fout.close()


"""
# TODO 如何合并大文件

1. cat 命令合并:
cat file1 file2 file3 > bigfile
cat file1 file2 file3 | sqlite database

2. 使用 python 块读取然后追加写入

https://stackoverflow.com/questions/5509872/python-append-multiple-files-in-given-order-to-one-big-file

def append_file_to_file(_from, _to):
block_size = 1024*1024
with open(_to, "ab") as outfile, open(_from, "rb") as infile:
while True:
input_block = infile.read(block_size)
if not input_block:
break
outfile.write(input_block)
# Given this building block, you can use:

for filename in ['a.bin','b.bin','c.bin']:
append_file_to_file(filename, 'outfile.bin')

3. use dask. https://rcpedia.stanford.edu/topicGuides/merging_data_sets_dask.html
"""


def main():
    """Generate a large file of incrementing digits, then split it.

    Demonstrates the module's generate-and-split workflow used for
    big-data processing exercises.
    """
    target = "nums.txt"
    # Alternative generator with random digits:
    # generate_big_random_digits(target, 1, 1000, 1024 * 1024)
    generate_big_incr_digits(target, 1, 1, 1024 * 1024)
    # Split the big file into fixed-size chunks for downstream processing.
    split(target)


if __name__ == "__main__":
main()
3 changes: 3 additions & 0 deletions calibre/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
*.mobi
*.pdf
*.epub
57 changes: 57 additions & 0 deletions calibre/10x程序员工作法.recipe
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
#!/usr/bin/python
# encoding: utf-8

from calibre.web.feeds.recipes import BasicNewsRecipe # 引入 Recipe 基础类

"""
教程:
- https://bookfere.com/tools#calibre
- https://www.jianshu.com/p/0bcb92509309
- https://snowdreams1006.github.io/myGitbook/advance/export.html

命令:
ebook-convert 10x程序员工作法.recipe 10x程序员工作法.epub
"""


class Blog(BasicNewsRecipe):
    """Calibre recipe that scrapes the 10x程序员工作法 column into an e-book."""

    # --- e-book metadata ---
    title = "10x程序员工作法"          # book title
    description = u"10x程序员工作法"   # short description
    # cover_url = ''                   # book cover image
    # masthead_url = ''                # masthead image
    __author__ = "web"                 # author
    language = "zh"                    # language
    encoding = "utf-8"                 # source encoding

    # --- fetching / cleanup settings ---
    # keep_only_tags = [{ 'class': 'example' }]  # keep only matched content
    no_stylesheets = True              # strip CSS
    remove_javascript = True           # strip JavaScript
    auto_cleanup = True                # auto-clean HTML
    delay = 10                         # seconds between page fetches
    max_articles_per_feed = 300        # max articles to grab
    timeout = 120.0
    simultaneous_downloads = 2         # parallel downloads sometimes fail; keep low

    def parse_index(self):
        """Build the article list Calibre converts into the book."""
        site="https://learn.lianglianglee.com/%E4%B8%93%E6%A0%8F/10x%E7%A8%8B%E5%BA%8F%E5%91%98%E5%B7%A5%E4%BD%9C%E6%B3%95"
        soup = self.index_to_soup(site)
        post = soup.findAll("div", {"class": "book-post"})[0]
        base = "http://learn.lianglianglee.com"
        articles = []
        for item in post.findAll("li"):
            name = item.a.contents[0].strip()
            href = base + '/' + item.a.get("href").lstrip('/')
            print(href)  # progress log while indexing
            articles.append({'title': name, 'url': href})
        return [(self.title, articles)]
56 changes: 56 additions & 0 deletions calibre/22讲通关Go语言.recipe
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
#!/usr/bin/python
# encoding: utf-8

from calibre.web.feeds.recipes import BasicNewsRecipe # 引入 Recipe 基础类

"""
教程:
- https://bookfere.com/tools#calibre
- https://www.jianshu.com/p/0bcb92509309
- https://snowdreams1006.github.io/myGitbook/advance/export.html

命令:
ebook-convert 22讲通关Go语言.recipe 22讲通关Go语言.epub
"""


class Blog(BasicNewsRecipe):
    """Calibre recipe that scrapes the 22讲通关Go语言 column into an e-book."""

    # --- e-book metadata ---
    title = "22讲通关Go语言"           # book title
    description = u"22讲通关Go语言"    # short description
    # cover_url = ''                   # book cover image
    # masthead_url = ''                # masthead image
    __author__ = "web"                 # author
    language = "zh"                    # language
    encoding = "utf-8"                 # source encoding

    # --- fetching / cleanup settings ---
    # keep_only_tags = [{ 'class': 'example' }]  # keep only matched content
    no_stylesheets = True              # strip CSS
    remove_javascript = True           # strip JavaScript
    auto_cleanup = True                # auto-clean HTML
    # delay = 2                        # seconds between page fetches
    max_articles_per_feed = 300        # max articles to grab
    timeout = 120.0
    simultaneous_downloads = 2         # parallel downloads sometimes fail; keep low

    def parse_index(self):
        """Build the article list Calibre converts into the book."""
        site = "https://learn.lianglianglee.com/%E4%B8%93%E6%A0%8F/22%20%E8%AE%B2%E9%80%9A%E5%85%B3%20Go%20%E8%AF%AD%E8%A8%80-%E5%AE%8C"
        soup = self.index_to_soup(site)
        post = soup.findAll("div", {"class": "book-post"})[0]
        base = "http://learn.lianglianglee.com"
        articles = []
        for item in post.findAll("li"):
            name = item.a.contents[0].strip()
            href = base + '/' + item.a.get("href").lstrip('/')
            articles.append({'title': name, 'url': href})
        return [(self.title, articles)]
56 changes: 56 additions & 0 deletions calibre/300分钟吃透分布式缓存.recipe
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
#!/usr/bin/python
# encoding: utf-8

from calibre.web.feeds.recipes import BasicNewsRecipe # 引入 Recipe 基础类

"""
教程:
- https://bookfere.com/tools#calibre
- https://www.jianshu.com/p/0bcb92509309
- https://snowdreams1006.github.io/myGitbook/advance/export.html

命令:
ebook-convert 300分钟吃透分布式缓存.recipe 300分钟吃透分布式缓存.epub
"""


class Blog(BasicNewsRecipe):
    """Calibre recipe that scrapes the 300分钟吃透分布式缓存 column into an e-book."""

    # --- e-book metadata ---
    title = "300分钟吃透分布式缓存"      # book title
    description = u"300分钟吃透分布式缓存"  # short description
    # cover_url = ''                   # book cover image
    # masthead_url = ''                # masthead image
    __author__ = "web"                 # author
    language = "zh"                    # language
    encoding = "utf-8"                 # source encoding

    # --- fetching / cleanup settings ---
    # keep_only_tags = [{ 'class': 'example' }]  # keep only matched content
    no_stylesheets = True              # strip CSS
    remove_javascript = True           # strip JavaScript
    auto_cleanup = True                # auto-clean HTML
    # delay = 2                        # seconds between page fetches
    max_articles_per_feed = 300        # max articles to grab
    timeout = 120.0
    simultaneous_downloads = 2         # parallel downloads sometimes fail; keep low

    def parse_index(self):
        """Build the article list Calibre converts into the book."""
        site= "https://learn.lianglianglee.com/%E4%B8%93%E6%A0%8F/300%E5%88%86%E9%92%9F%E5%90%83%E9%80%8F%E5%88%86%E5%B8%83%E5%BC%8F%E7%BC%93%E5%AD%98-%E5%AE%8C"
        soup = self.index_to_soup(site)
        post = soup.findAll("div", {"class": "book-post"})[0]
        base = "http://learn.lianglianglee.com"
        articles = []
        for item in post.findAll("li"):
            name = item.a.contents[0].strip()
            href = base + '/' + item.a.get("href").lstrip('/')
            articles.append({'title': name, 'url': href})
        return [(self.title, articles)]
56 changes: 56 additions & 0 deletions calibre/DDD实战课.recipe
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
#!/usr/bin/python
# encoding: utf-8

from calibre.web.feeds.recipes import BasicNewsRecipe # 引入 Recipe 基础类

"""
教程:
- https://bookfere.com/tools#calibre
- https://www.jianshu.com/p/0bcb92509309
- https://snowdreams1006.github.io/myGitbook/advance/export.html

命令:
ebook-convert DDD实战课.recipe DDD实战课.epub
"""


class Blog(BasicNewsRecipe):
    """Calibre recipe that scrapes the DDD实战课 column into an e-book."""

    # --- e-book metadata ---
    title = "DDD实战课"                # book title
    description = u"DDD实战课"         # short description
    # cover_url = ''                   # book cover image
    # masthead_url = ''                # masthead image
    __author__ = "web"                 # author
    language = "zh"                    # language
    encoding = "utf-8"                 # source encoding

    # --- fetching / cleanup settings ---
    # keep_only_tags = [{ 'class': 'example' }]  # keep only matched content
    no_stylesheets = True              # strip CSS
    remove_javascript = True           # strip JavaScript
    auto_cleanup = True                # auto-clean HTML
    # delay = 2                        # seconds between page fetches
    max_articles_per_feed = 300        # max articles to grab
    timeout = 120.0
    simultaneous_downloads = 2         # parallel downloads sometimes fail; keep low

    def parse_index(self):
        """Build the article list Calibre converts into the book."""
        site = "https://learn.lianglianglee.com/%E4%B8%93%E6%A0%8F/DDD%E5%AE%9E%E6%88%98%E8%AF%BE"
        soup = self.index_to_soup(site)
        post = soup.findAll("div", {"class": "book-post"})[0]
        base = "http://learn.lianglianglee.com"
        articles = []
        for item in post.findAll("li"):
            name = item.a.contents[0].strip()
            href = base + '/' + item.a.get("href").lstrip('/')
            articles.append({'title': name, 'url': href})
        return [(self.title, articles)]
Loading