一道简单的单表代换加密

Coast23

题目描述

已知明文节选自《聊斋志异》, 转为拼音后进行单表加密, 现要根据密文进行解密.

密文如下:

LRZRLKOBNRLVIROZNKOHUKOBRSNRVUIRSNRXVRUVOXVRXYUYSNVOHGVRLROLROLDYIRBRLYOBNROFEKLRBNRURBNRJYKOSNKOHCKOCYBNKOMKOBNRMKOHSNRPROLRURCKOHBNYXYJRKOXVRUVONYBNVLKDPRBYDJYKOSNKOHMVRIRKDTKOHJROHJYVCYGKRKOIRKJRPRLRLROHZNRLRLRNKRLDYQDYBNRIRKJRWROHWROHMKOUKOJYVEKRBNRSNYGKRMVONVSNVOXVRUVOIRKDLYVPRVLKOSNRBRUYLYJYOHYIRKOHBNRLRJRYURZNYRPROHAKOWYDGYLRBNRQDOHPRKOBKRIRKMYCYLDYEYIROHYUVOBNVTKOHIRSYRLYPROZNYUKOBNVOIRJRKOPROKRGVRBNRVUGYBNRMVRUVOXVREYGRSNROFBYDPRZVOFJRVMYEYGYWROHOFLYVJYOSYDLRGYOVOHWVOHEKBNVWYLREYVUSNRHYKOZNYOPRYGKOHSNKOHEYUYJYOBNVJRUVOUYDGYWROHPRVNKOHPYLRTKOHBKOZDOHBNRSNKDPROHMKOHPRJRKDLROSDOHCYPRLYQVSYDOFGYBNRSYDBKRSNVOBNRSKOHSNRBNYVUEKDBNRSNYMYLROHJRNYLROFSYDLROZNYPYNKOSNYIRJRKOBNRBNRBNRJRYZNYHYDEVBNRNYBNRGYEDOHCYLRKRBNYOFOKRIRKLYVJYOBKRGYWROHEKOHIRKOHLDOHJYVLROSNRBNRPRAROHZNYAYBNRJYURLYKDIRVUTKOHLRSNYGYSNYPYOFGYBKRBVPRVJYKOTRYTKOQDOHMVROFJYVLROPYNKOSNYERGKJYKOBKNYOWKSYDLRXRBNRLRUREYNKOOFBNRJROHGYBNRJYVNYEYBNRJRLKOJYKOVUOFLRMKOHLREKJYXROHSDYBNYJYKOXRKDGYQVEVJRUVOHLYNKOSNYGKJYKOBNDOHEVBNRLVSNYGYSNYKOHLROBKRGKRBNYSNRGYCYEYOFOKRIRKLYBNRLRLYVSKOURGYHDOHEKOHCYPYBNRSKOURNYLRJYLROHOFVUBROFOKRIRSNDYLRIRKOSYDIRKOMYURHDOHLRPYTKOHSNDYLROHXYBNYMYIRKWKJRJRYBNRSYRSNDYLROHJRVGYJYVHYMYOFOKRURLYLROGDTKOHSYRTVVUMKOHEYOFLDYBDOHBNRZNYXVOSNRJRVQVLDYZRWRWKOHBNRXROHGKDBNYOFLYVBRQVLRZNYVUSNRLR

解密过程

单表加密, 最常见的做法就是做词频分析.

如果是英文文本的话, 用quipqiup很快就能得解.

但这里是中文拼音, 似乎没有现成的工具, 那就老老实实做词频分析.

统计聊斋全书的拼音词频

首先要知道中文拼音的一般频率分布, 这里我直接网上找《聊斋志异》的txt文本.

去除前言什么的, 只留下小说部分, 保存txt为sample.txt, 运行如下代码 (GPT-4o)

import re
from collections import Counter
from pypinyin import lazy_pinyin

def clean_text(filename):
    """Return the contents of *filename* with every non-Chinese character removed.

    The file is read as UTF-8.  Every character outside the range
    U+4E00..U+9FA5 (punctuation, whitespace, Latin letters, digits, ...)
    is dropped, so the result contains Chinese characters only.
    """
    with open(filename, 'r', encoding='utf-8') as source:
        raw = source.read()
    return re.sub(r'[^\u4e00-\u9fa5]', '', raw)

def extract_initial_and_final(word):
    """Split the pinyin of *word* into initials (shengmu) and finals (yunmu).

    *word* is converted with ``lazy_pinyin``; every syllable it returns
    contributes at most one initial and at most one final.

    Returns a dict ``{'initials': [...], 'finals': [...]}``.

    * A syllable that starts with none of the listed initials (for example
      one beginning with ``y``, ``w`` or a vowel) contributes no initial.
    * The final is the longest listed final that the rest of the syllable
      ends with, so a final that is not listed falls back to its longest
      listed suffix (``ian`` is recorded as ``an``).
    """
    # Digraph initials come first so that "zh" is never mistaken for "z".
    initials = ('zh', 'ch', 'sh',
                'b', 'p', 'm', 'f', 'd', 't', 'n', 'l', 'g', 'k', 'h',
                'j', 'q', 'x', 'z', 'c', 's', 'r')
    finals = ['a', 'o', 'e', 'i', 'u', 'v', 'ai', 'ei', 'ui', 'ao', 'ou', 'iu', 'ie', 've', 'er', 'an', 'en', 'in', 'un', 'ün',
              'ang', 'eng', 'ing', 'ong']
    # Longest candidates first.  Testing in list order let the single
    # vowels shadow every compound final that ends in a vowel ("ai" was
    # recorded as "i", "ao" as "o", ...), so those were never counted.
    finals_by_length = sorted(finals, key=len, reverse=True)

    result = {'initials': [], 'finals': []}

    for syllable in lazy_pinyin(word):
        rest = syllable

        for initial in initials:
            if rest.startswith(initial):
                result['initials'].append(initial)
                rest = rest[len(initial):]  # keep only the part after the initial
                break

        for final in finals_by_length:
            if rest.endswith(final):
                result['finals'].append(final)
                break

    return result

def count_pinyin(text):
    """Tally initials and finals over *text* and build its upper-case pinyin.

    Each character of *text* is converted on its own, both for the
    initial/final statistics and for the pinyin string.

    Returns a tuple ``(initials, finals, pinyin_text)``:

    * ``initials`` / ``finals`` -- lists of ``(item, count)`` pairs, most
      frequent first;
    * ``pinyin_text`` -- the pinyin of every character, upper-cased and
      concatenated without separators.
    """
    initial_tally = Counter()
    final_tally = Counter()
    syllables = []

    for char in text:
        parts = extract_initial_and_final(char)
        initial_tally.update(parts['initials'])
        final_tally.update(parts['finals'])
        syllables += lazy_pinyin(char)

    pinyin_text = ''.join(syllables).upper()

    return initial_tally.most_common(), final_tally.most_common(), pinyin_text

def main():
    """Print initial/final frequency tables for sample.txt and save its pinyin.

    Reads ``sample.txt``, prints the two frequency tables (most frequent
    first) to standard output and writes the upper-case pinyin text to
    ``pinyin.txt``.
    """
    cleaned = clean_text('sample.txt')
    initial_freq, final_freq, pinyin_text = count_pinyin(cleaned)

    # Initials table.
    print("声母频数统计:")
    for name, count in initial_freq:
        print(f"{name}: {count}")

    # Finals table, separated from the first one by a blank line.
    print("\n韵母频数统计:")
    for name, count in final_freq:
        print(f"{name}: {count}")

    with open('pinyin.txt', 'w', encoding='utf-8') as out:
        out.write(pinyin_text)


if __name__ == '__main__':
    main()

得到如下按频数从高到低排好序的频数表 (注意: 表中的韵母是按拼音的结尾来归类的, ai、ei、ui、ao、ou、iu、ie 等复韵母都被计入了其末尾的单韵母 i、u、o、e, 所以没有单独列出):

声母频数统计:
zh: 29643
sh: 28934
j: 28765
q: 18439
x: 18200
b: 17367
r: 14622
d: 14158
l: 13905
g: 13834
h: 13649
z: 12934
n: 11449
f: 11430
m: 10998
s: 10567
ch: 9473
t: 7654
c: 6688
k: 5939
p: 3464

韵母频数统计:
i: 111316
u: 76706
an: 32737
e: 32492
o: 26338
ang: 19494
en: 14397
ing: 12005
ong: 11837
a: 11427
eng: 11316
in: 8509
er: 6659
un: 5059
v: 3797

统计密文的词频

将密文保存到ciphertext.txt, 运行如下代码 (GPT-4o):

def getNWordList(n):
    """Count the n-grams of ciphertext.txt and write them to fre_<n>.txt.

    Every run of *n* consecutive characters of ``ciphertext.txt`` is
    counted.  N-grams that occur more than once are written to
    ``fre_<n>.txt`` together with their count and ``count / len(text)``,
    most frequent first; n-grams with equal counts keep the order in which
    they first appear in the text.

    :param n: n-gram size (1 = single letters, 2 = letter pairs, ...).
        Values below 1 yield no n-grams.
    :return: the n-grams that occur more than once, most frequent first.
    """
    # Local import: this script has no import block of its own.
    from collections import Counter

    # A context manager closes the file; the explicit encoding keeps the
    # result independent of the platform default.
    with open('ciphertext.txt', encoding='utf-8') as c_file:
        c_text = c_file.read()

    if n >= 1:
        # One pass over the text instead of one list.count() per n-gram.
        counts = Counter(c_text[i:i + n] for i in range(len(c_text) - n + 1))
    else:
        counts = Counter()

    # Only n-grams seen at least twice are reported.
    repeated = [(gram, times) for gram, times in counts.most_common() if times > 1]

    file_name = f'fre_{n}.txt'
    # The header is Chinese, so the output encoding is fixed as well.
    with open(file_name, 'w', encoding='utf-8') as f:
        print(f"{n}元字母", "出现次数", "\t频率", file=f)
        for gram, times in repeated:
            print(f"{gram}\t\t{times}\t\t{times / len(c_text)}", file=f)

    return [gram for gram, _ in repeated]

# Produce the frequency tables for 1-grams through 6-grams
# (fre_1.txt ... fre_6.txt); the returned lists are not used here.
for gram_size in range(1, 7):
    getNWordList(gram_size)

得到如下结果 (只截取部分):

1元字母 出现次数 	频率
R 197 0.15354637568199533
Y 148 0.11535463756819954
O 141 0.10989867498051442
N 99 0.07716289945440374
K 98 0.07638347622759158
V 68 0.05300077942322681
H 62 0.048324240062353856
B 62 0.048324240062353856
L 62 0.048324240062353856
...

很容易推断出R = i, KOH = ang, BN = zh / sh等.

但随即我发现, 根本没有必要根据词频猜测破解密码表.

pinyin.txt, 也就是《聊斋志异》全文的拼音,
只有约 112 万个字符.

因此, 我们可以直接O(N)暴力匹配密文片段!

对于这样的密文片段:
BNROFEKLRBNRURBNRJYKOSNKOHCKOCYBNKOMKOBNR

它的 pattern 也就是 (ABC 表示重复出现的同一个三字母组, 这里是 BNR; {nX} 表示中间隔着 n 个任意字母):

ABC {6X} ABC {2X} ABC {21X} ABC

下面的 C 代码在 pinyin.txt 中查找所有符合该 pattern 的位置:
#include <stdio.h>
#include <string.h>

/* Buffer for the pinyin text; main() fills it from pinyin.txt. */
char str[1130000];
/* Number of free characters between consecutive occurrences of the
 * three-letter group: ABC {6} ABC {2} ABC {21} ABC.  Only the first three
 * entries are read; the remaining two are zero-initialised. */
int indent[5] = {6, 2, 21};

/*
 * Return 1 when the three characters at str[i..i+2] occur again after
 * indent[0], then indent[1], then indent[2] further characters, i.e. when
 * the text at offset i has the shape ABC{6}ABC{2}ABC{21}ABC; 0 otherwise.
 * Reads up to str[i + 40]; the caller keeps i far enough from the end.
 */
_Bool check(int i){
    int offset = 0;  /* distance from the first group to the current repeat */
    for(int gap = 0; gap < 3; ++gap){
        offset += indent[gap] + 3;
        for(int k = 0; k < 3; ++k){
            if(str[i + k] != str[i + k + offset]) return 0;
        }
    }
    return 1;
}

/*
 * Load the first line of pinyin.txt into str and print a 100-character
 * window starting at every offset where check() reports the pattern.
 * Returns 0 on success, 1 when pinyin.txt cannot be opened or read.
 */
int main(void){
    /* Characters printed per hit.  Stopping this far before the end also
     * keeps the look-ahead done by check() inside the text. */
    const int window = 100;

    if(freopen("pinyin.txt", "r", stdin) == NULL){
        perror("pinyin.txt");
        return 1;
    }
    /* fgets reads a single line, so the whole text must be on the first
     * line of the file. */
    if(fgets(str, sizeof str, stdin) == NULL){
        fputs("pinyin.txt: nothing to read\n", stderr);
        fclose(stdin);
        return 1;
    }
    fclose(stdin);

    /* Measure once and compare as signed ints: "strlen(str) - 100" is an
     * unsigned expression that wraps around for texts shorter than the
     * window, and calling strlen in the loop condition rescans the text
     * on every iteration. */
    const int len = (int)strlen(str);
    for(int i = 0; i + window < len; ++i){
        if(check(i)){
            fwrite(str + i, 1, (size_t)window, stdout);
            putchar('\n');
        }
    }
    return 0;
}

在原文中有唯一匹配, 对应的原文拼音为:

ZHINVDAYIZHIRIZHIJUANSHANGFANFUZHANWANZHI

pinyin.txt中搜索该串字符, 很容易找出密文对应的原文拼音为:

YICIYANZHIYEXINCHANGRANZISHIERXISHIMEIRENMEIMURUSHENGBEIYINYINYOUXIZIYUNZHINVDAYIZHIRIZHIJUANSHANGFANFUZHANWANZHIWANGSHIQINYIRIFANGZHUMUJIANMEIRENHUZHEYAOQIZUOJUANSHANGWEIXIAOLANGJINGJUEFUBAIANXIAJIQIYIYINGCHIYIYIHAIYOUKOUZHIXIAJITINGTINGWANRANJUEDAIZHISHUBAIWENHESHENMEIRENXIAOYUEQIEYANSHIZIRUYUJUNGUXIANGZHIYIJIURICHUIQINGPANTUOBUYIZHIKONGQIANZAIXIAWUFUYOUDUXINGURENZHELANGXISUIYUQINCHURANZHENXIJIANQINAIBEIZHIERBUZHIWEIRENMEIDUBISHINVZUOQICENVJIEWUDUBUTINGNVYUEJUNSUOYIBUNENGTENGDAZHETUYIDUERSHIGUANCHUNQIUBANGSHANGDURUJUNZHEJIRENRUOBUTINGQIEXINGQUYILANGZANCONGZHISHAOQINGWANGQIJIAOYINSONGFUQIYUKESUONVBUZHISUOZAISHENZHISANGSHIZHUERDAOZHISHUWUYINGJIHUYINVSUOYINCHUQUHANSHUXIJIANZHIZHIZHIJIUCHUGUODEZHIHUZHIBUDONGFUYIAIZHUNVNAIXIAYUEJUNZAIBUTINGDANGXIANGYONGJUEYINSHIZHIQIPINGCHUPUZHIJURIYUAOXIERLANGYISHUBUSHUQUNVBUZAIZEQIEJUANLIULANKONGWEINVJUEYINQUHANSHUDIBAJUANZAHUNTASUOYIMIZHIYIRIDUHANNVZHIJINGBUZHIJUEHUDUZHIJIYANJUANERNVYIWANGYIDAJUMINGSOUZHUJUANMIAOBUKEDEJIRENGYUHANSHUBAJUANZHONGDEZHIYESHUBUSHUANGYINZAIBAIZHUSHIBUFUDUNVNAIXIAYUZHIYIYUESANRIBUGONGDANGFUQUZHISANRIHUYIJUYINGNVERZINVNAIXISHOUYIXIANSUOXIANWURIGONGYIQULANGSHOUYINGMUZHUWUXIATAJIJIUZHISUISHOUYINGJIEBUJUEGUWUNVNAIRIYUYINBOLANGSUILEERWANGDUNVYOUZONGZHICHUMENSHIJIEKEYOUCITITANGZHIMINGBAOZHUNVYUEZIKEYICHUERSHIYI

根据拼音获取原文

水平过低, 一眼看不出原文.
只能写代码跑了…
废物GPT-4o写不出能跑的代码, 还得我自己写.

import re
from pypinyin import lazy_pinyin

def getText(file: str) -> str:
    """Return the Chinese characters of *file*, in order, and nothing else.

    The file is decoded as UTF-8 and undecodable bytes are skipped.  Every
    character outside U+4E00..U+9FA5 is removed from the result.
    """
    with open(file, "r", encoding = "utf-8", errors = "ignore") as handle:
        return re.sub(r"[^\u4e00-\u9fa5]", "", handle.read())

def getPinyin(text: "str | list[str]") -> str:
    """Return the pinyin of *text* as one string without separators.

    *text* is handed to ``lazy_pinyin`` unchanged and the syllables it
    returns are concatenated.  The caller in this script passes a ``str``
    slice, which is why the parameter is no longer annotated as ``list``.
    """
    return "".join(lazy_pinyin(text))

def find(pattern: str, text: str, length:int = 14) -> str:
    """Return the first *length*-character slice of *text* whose pinyin is *pattern*.

    Every window of exactly *length* characters is converted with
    ``getPinyin`` and compared with *pattern*; windows are tried from left
    to right.  When no window matches, the string "匹配失败." is returned.
    """
    windows = (text[start : start + length] for start in range(len(text) - length + 1))
    return next((window for window in windows if getPinyin(window) == pattern), "匹配失败.")

def main():
    """Locate the decrypted pinyin fragment in sample.txt and print the match."""
    # The fragment covers 14 or 15 characters, depending on whether "juan"
    # is one syllable or "ju" + "an"; find() uses its default window of 14.
    pattern = "zhinvdayizhirizhijuanshangfanfuzhanwanzhi"
    source_text = getText("sample.txt")
    print(f"Result: {find(pattern, source_text)}")


if __name__ == "__main__":
    main()
Result:  织女大异之日置卷上反复瞻玩至

很容易就定位到原文了:

...以此验之耶?”心怅然自失。而细视美人,眉目如生;背隐隐有细字云:“织女。”大异之。日置卷上,反复瞻玩,至忘食寝。一日方注目间,美人忽折腰起,坐卷上微笑。郎惊绝,伏拜案下。既起,已盈尺矣。益骇,又叩之。下几亭亭,宛然绝代之姝。拜问:“何神?”美人笑曰:“妾颜氏,字如玉,君固相知已久。日垂青盼,脱不一至,恐千载下无复有笃信古人者。”郎喜,遂与寝处。然枕席间亲爱倍至,而不知为人。

每读必使女坐其侧。女戒勿读,不听;女曰:“君所以不能腾达者,徒以读耳。试观春秋榜上,读如君者几人?若不听,妾行去矣。”郎暂从之。少顷忘其教,吟诵复起。逾刻索女,不知所在。神志丧失,嘱而祷之,殊无影迹。忽忆女所隐处,取《汉书》细检之,直至旧处,果得之。呼之不动,伏以哀祝。女乃下曰:“君再不听,当相永绝!”因使治棋枰、樗蒲之具,日与遨戏。而郎意殊不属。觑女不在,则窃卷流览。恐为女觉,阴取《汉书》第八卷,杂混他所以迷之。一日读酣,女至竟不之觉;忽睹之,急掩卷而女已亡矣。大惧,冥搜诸卷、渺不可得;既,仍于《汉书》八卷中得之,页数不爽。因再拜祝,矢不复读。

女乃下,与之弈,曰:“三日不工,当复去。”至三日,忽一局赢女二子。女乃喜,授以弦索,限五日工一曲。郎手营目注,无暇他及;久之随手应节,不觉鼓舞。女乃日与饮博,郎遂乐而忘读,女又纵之出门,使结客,由此倜傥之名暴著。女曰:“子可以出而试矣。”

摘自《聊斋志异》卷十一·书痴

求密码表

# Recover the substitution table by aligning two texts character by
# character: each character of en.txt is mapped to the character at the
# same position in de.txt.
# NOTE(review): en.txt is presumably the ciphertext and de.txt the
# upper-case plaintext pinyin (the printed table maps R -> I) -- confirm.
with open("en.txt", "r") as f:
    en = f.read()
with open("de.txt", "r") as f:
    de = f.read()

# Indexing de by positions of en raised IndexError when de.txt was the
# shorter file; only the common prefix is compared, and the mismatch is
# reported instead.
if len(en) != len(de):
    print(f"Warning: en.txt has {len(en)} characters but de.txt has {len(de)}; only the common prefix is compared.")

key_list = {}

for i, (src, dst) in enumerate(zip(en, de)):
    # The first pairing seen for a character wins; later disagreements
    # are reported but do not overwrite it.
    known = key_list.setdefault(src, dst)
    if known != dst:
        print(f"Error at position {i}: The mapping is {src} -> {known} but {dst} is expected.")

# Print the table for A..Z (code points 65..90).
for code in range(65, 91):
    letter = chr(code)
    if letter in key_list:
        print(f"{letter} -> {key_list[letter]}")
    else:
        # Show the letter itself; the bare integer code was printed before.
        print(letter, "not found")
Error at position 545: The mapping is N -> H but X is expected.
Error at position 546: The mapping is K -> A but I is expected.
A -> P
B -> Z
C -> F
D -> O
E -> D
F -> V
G -> B
H -> G
I -> X
J -> J
K -> A
L -> Y
M -> W
N -> H
O -> N
P -> Q
Q -> K
R -> I
S -> S
T -> L
U -> R
V -> E
W -> T
X -> M
Y -> U
Z -> C

有2个字符没对应上, 应该是多音字导致的, 无伤大雅.

密码表如下:

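| 映射前 | 映射后 |
| --- | --- |
| A | P |
| B | Z |
| C | F |
| D | O |
| E | D |
| F | V |
| G | B |
| H | G |
| I | X |
| J | J |
| K | A |
| L | Y |
| M | W |
| N | H |
| O | N |
| P | Q |
| Q | K |
| R | I |
| S | S |
| T | L |
| U | R |
| V | E |
| W | T |
| X | M |
| Y | U |
| Z | C |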

找出多音字

学长希望找到这个多音字, 随便写了份代码:

import re
from collections import Counter
from pypinyin import lazy_pinyin

def clean_text(filename):
    """Read *filename* as UTF-8 and keep only its Chinese characters.

    Characters outside U+4E00..U+9FA5 are removed; the order of the
    remaining characters is unchanged.
    """
    with open(filename, "r", encoding="utf-8") as fh:
        content = fh.read()
    hanzi_only = re.sub(r"[^\u4e00-\u9fa5]", "", content)
    return hanzi_only

def go(start: int = 540, end: int = 550):
    """Print the characters of ans.txt whose pinyin ends inside a window.

    The characters of ``ans.txt`` are converted to pinyin one at a time.
    A character is printed (followed by a space) when the total number of
    pinyin letters produced up to and including that character lies in
    ``[start, end]``.

    :param start: smallest cumulative pinyin length that is reported.
    :param end: largest cumulative pinyin length that is reported.

    The defaults bracket the mismatch positions 545 and 546 reported when
    the substitution table was built.
    """
    text = clean_text("ans.txt")
    consumed = 0  # pinyin letters produced so far; only the length is needed
    for char in text:
        consumed += len("".join(lazy_pinyin([char])))
        if consumed > end:
            # The total never shrinks, so nothing further can be in range.
            break
        if consumed >= start:
            print(char, end = " ")


if __name__ == "__main__":
    go()

输出为 "听 妾 行", 可知出问题的多音字是 "行": lazy_pinyin 给出的拼音为 XING, 而密文对应的拼音是 HANG. 在 "妾行去矣" 一句中, 正确读音显然应为 XING.

  • 标题: 一道简单的单表代换加密
  • 作者: Coast23
  • 创建于 : 2024-12-19 19:16:29
  • 更新于 : 2025-01-21 14:48:29
  • 链接: https://coast23.github.io/2024/12/19/一道简单的单表代换加密/
  • 版权声明: 本文章采用 CC BY-NC-SA 4.0 进行许可。
评论