base大家族詳解
1 簡介
對於base編碼的使用,參考Base What? A Practical Introduction to Base Encoding。
- base2: 2進制
- base10: 10進制
- base16: 計算機字節表示
- base32: 字母加數字
- base64: 更大的表示範圍
- base58: bitcoin為了比base64更易讀而採用的表示
這些base編碼主要是在計算機的二進制表示,和人可讀的字符表示之間轉換。
具體實現可以分為2種,能利用bit位直接轉換的base(例如base2, base16,base32, base64),和不能利用bit位直接轉換的base(例如base10, base58)。
2 實現代碼
全部採用clojure實現,主要是為了理解base編碼,只求代碼容易理解,性能不是很好:
(ns base
  ;; clojure.set is referenced by fully-qualified name below
  ;; (`clojure.set/subset?`), so it must be loaded explicitly.
  (:require [clojure.set]))

(defn indices-of
  "Returns the indices of the items of `coll` for which `(f item)` is truthy."
  [f coll]
  (keep-indexed (fn [idx item] (when (f item) idx)) coll))

(defn first-index-of
  "Returns the index of the first item of `coll` for which `(f item)` is
  truthy, or nil when there is none."
  [f coll]
  (first (indices-of f coll)))

(defn find-thing
  "Returns the index of the first item of `coll` equal to `value`."
  [value coll]
  (first-index-of #(= % value) coll))

(def divmod
  "Returns [quotient remainder]."
  (juxt quot mod))

;; 1. Bases that cannot be converted by regrouping bits (base10, base58 ...).
;; The input is treated as one big number that is repeatedly divided by N
;; (the size of the table). Each remainder is an index into base-table and
;; the quotient becomes the next dividend, until nothing is left to divide.

(defn base-encode-num
  "Encodes the number `n` with `base-table`.
  `leading-zeros` is how many padding characters (the first character of
  `base-table`) are put in front of the result.
  Returns the encoded string."
  [n leading-zeros base-table]
  (let [base-count (count base-table)]
    (loop [n n
           result ""]
      (if (>= n base-count)
        (let [[d m] (divmod n base-count)]
          ;; Digits are produced from the least significant end, so every new
          ;; digit (table entry at the remainder) is prepended.
          (recur d (str (nth base-table m) result)))
        (->> (if (pos? n)
               (str (nth base-table n) result)
               result)
             ;; The digits were generated back to front, so the padding
             ;; goes in front of them.
             (concat (repeat leading-zeros (first base-table)))
             (apply str))))))

(defn base-enc
  "Encodes the string `s` with a non power-of-two table (base10, base36,
  base58 ...). The UTF-8 bytes of `s` are read as one unsigned big-endian
  number; leading zero bytes are kept as leading padding characters."
  [base-table s]
  (let [s-bytes (.getBytes ^String s "UTF-8")
        leading-zeros (count (take-while zero? s-bytes))]
    (-> (java.math.BigInteger. 1 s-bytes)
        (base-encode-num leading-zeros base-table))))

(defn invert-table
  "Builds the reverse lookup of `table`: a map from each item to its index."
  [table]
  (into {} (map vector table (range))))

;; Decoding a baseN string: look up the index of every character, multiply it
;; by N raised to the character's position counted from the end, sum
;; everything up, and turn the resulting number back into a string.

(defn base-dec
  "Decodes the string `s` that was encoded with a non power-of-two table
  (base10, base36, base58 ...). Leading padding characters (the first
  character of `base-table`) become leading zero bytes. The decoded bytes are
  read as UTF-8. A character of `s` that is not in `base-table` makes the
  arithmetic fail with an exception."
  [base-table s]
  (let [zero-char (first base-table)
        padding (->> s
                     (take-while #(= % zero-char))
                     (map (constantly (byte 0))))
        inverted-table (invert-table base-table)
        base-count (count base-table)
        ;; r is the running sum, m the current power of N. Both `+'` and `*'`
        ;; auto-promote, so values beyond the long range do not throw.
        [value _] (reduce (fn [[r m] c]
                            [(+' r (*' m (inverted-table c)))
                             (*' m base-count)])
                          [0 1]
                          (reverse s))
        ;; toByteArray may prepend a zero sign byte, and a zero value yields
        ;; a single zero byte; both are dropped here.
        magnitude (drop-while zero? (.toByteArray (biginteger value)))
        ^bytes out (byte-array (concat padding magnitude))]
    (String. out "UTF-8")))

(def base58-table "123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz")
(def enc-base58 (partial base-enc base58-table))
(def dec-base58 (partial base-dec base58-table))

(def base36-table "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ")
(def enc-base36 (partial base-enc base36-table))
(def dec-base36 (partial base-dec base36-table))

(def base10-table "0123456789")
(def enc-base10 (partial base-enc base10-table))
(def dec-base10 (partial base-dec base10-table))

(comment
  (def target "D9cS9N9iHjMLTdA8YSMRMp")
  (def r (dec-base58 target))
  (= target (enc-base58 r))

  (= "64021609848" (enc-base36 "testabc"))

  (enc-base10 "this is a test")
  ;; => "2361031878030638688519054699098996"

  (dec-base10 "2361031878030638688519054699098996")
  ;; => "this is a test"
  )

;;; ============= Power-of-two bases: base2, base4, base16, base32, base64 ...
;;; These can be converted by regrouping bits.
;;; The input string of 8-bit units is turned into a bit sequence, which is
;;; re-split by the bit width of the base (6 bits for base64, 5 for base32)
;;; and looked up in the table. No division is needed.
;;; They are collectively called "base2" below.

(defn bits
  "Converts the number `n` to a sequence of bits, most significant first.
  `width` defaults to 8 when it is not given."
  ([n] (bits n 8))
  ([n width]
   (reverse (map #(bit-and (bit-shift-right n %) 1) (range width)))))

(defn numb
  "Converts a sequence of bits (most significant first) to a number."
  [bits]
  (BigInteger. (apply str bits) 2))

(defn string-to-bits
  "Converts the string `msg` to the bit sequence of its UTF-8 bytes."
  [msg]
  (mapcat bits (.getBytes ^String msg "UTF-8")))

(defn padding-count
  "Returns how many trailing zeros the sequence `v` ends with."
  [v]
  (->> (vec v) rseq (take-while zero?) count))

(defn drop-padding
  "Removes the trailing zeros from the sequence `v`."
  [v]
  (drop-last (padding-count v) v))

(defn log2
  "Returns the base-2 logarithm of `n` as a double."
  [n]
  (/ (Math/log n) (Math/log 2)))

(defn get-bit-width
  "Returns the bit width of `base-table`, i.e. how many bits one encoded
  character carries, or nil when the table size is not a power of two
  greater than one."
  [base-table]
  (let [n (long (count base-table))]
    ;; Exact integer test instead of comparing a floating-point logarithm
    ;; with its floor. A one-character table (width 0) is rejected because
    ;; nothing can be encoded with zero bits per character.
    (when (and (> n 1) (= 1 (Long/bitCount n)))
      (Long/numberOfTrailingZeros n))))

(defn valid-base2-table?
  "Returns true when `base-table` is usable as a power-of-two table."
  [base-table]
  (not (nil? (get-bit-width base-table))))

(defn- gcd
  "Greatest common divisor of the non-negative integers `a` and `b`."
  [a b]
  (if (zero? b)
    a
    (recur b (mod a b))))

(defn byte-align
  "Returns the smallest number of `bit-width`-bit characters whose total bit
  count is a multiple of 8. For example base64 characters carry 6 bits each,
  so 4 of them are needed to line up with whole 8-bit bytes."
  [bit-width]
  ;; lcm(8, bit-width) / bit-width == 8 / gcd(8, bit-width)
  (quot 8 (gcd 8 bit-width)))

(defn hex-byte-align
  "Returns the smallest number of 8-bit bytes that line up with a whole
  number of `bit-width`-bit characters."
  [bit-width]
  (-> (byte-align bit-width)
      (* bit-width)
      (/ 8)))

(comment
  (byte-align 6)
  ;; => 4
  (hex-byte-align 6)
  ;; => 3
  ;; base64 needs four 6-bit characters to line up with three 8-bit bytes,
  ;; i.e. 4 * 6 = 3 * 8

  (byte-align 5)
  ;; => 8
  (hex-byte-align 5)
  ;; => 5
  ;; base32 needs eight 5-bit characters to line up with five 8-bit bytes,
  ;; i.e. 8 * 5 = 5 * 8

  (hex-byte-align 4)
  ;; => 1
  (hex-byte-align 2)
  ;; => 1
  (hex-byte-align 1)
  ;; => 1
  ;; base2, base4 and base16 line up with a single byte and need no padding
  )

(defn gen-padding
  "Returns the \"=\" padding string for `data`, the sequence of bit groups
  (one per encoded character) produced for `bit-width`."
  [bit-width data]
  (let [len (count data)
        align (byte-align bit-width)
        aligned-byte (mod len align)]
    (if (zero? aligned-byte)
      ""
      (apply str (repeat (- align aligned-byte) "=")))))

(defn base2-enc
  "Encodes the string `s` with a power-of-two table.
  `base-table` must have 2^n entries, e.g. base16, base32, base64."
  [base-table s]
  {:pre [(valid-base2-table? base-table)]}
  (let [msg-bits (string-to-bits s)
        base2-char #(nth base-table (numb %))
        bit-width (get-bit-width base-table)
        ;; The last group is filled up with zero bits.
        data (partition bit-width bit-width (repeat 0) msg-bits)]
    (str (apply str (map base2-char data))
         (gen-padding bit-width data))))

(defn base2-dec
  "Decodes the string `s` with the power-of-two table `base-table`.
  Characters of `s` that are neither in `base-table` nor \"=\" are ignored.
  Trailing zero bytes of the result are removed as padding and the remaining
  bytes are read as UTF-8."
  [base-table s]
  {:pre [(valid-base2-table? base-table)]}
  (let [bit-width (get-bit-width base-table)
        ;; The padding character decodes to zero bits.
        inverted-table (merge (invert-table base-table) {\= 0})
        ^bytes decoded (->> s
                            ;; some-> yields nil for unknown characters and
                            ;; mapcat drops those nils.
                            (mapcat #(some-> (inverted-table %) (bits bit-width)))
                            (partition 8 8 (repeat 0))
                            (map numb)
                            drop-padding
                            ;; values 128..255 must wrap into the signed
                            ;; byte range
                            (map unchecked-byte)
                            byte-array)]
    (String. decoded "UTF-8")))

;; base64 and base32 go through base2-enc / base2-dec
(def base64-table "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/")
(def enc-base64 (partial base2-enc base64-table))
(def dec-base64 (partial base2-dec base64-table))

(def base32-table "ABCDEFGHIJKLMNOPQRSTUVWXYZ234567")
(def enc-base32 (partial base2-enc base32-table))
(def dec-base32 (partial base2-dec base32-table))

;; base16 is the same as a hex dump
(def base16-table "0123456789ABCDEF")
(def enc-base16 (partial base2-enc base16-table))
(def dec-base16 (partial base2-dec base16-table))

;; base2 is the binary representation
(def base2-table "01")
(def enc-base2 (partial base2-enc base2-table))
(def dec-base2 (partial base2-dec base2-table))

;; Macro that defines a base automatically
(defmacro defbase
  "Defines `<base-name>-table`, `enc-<base-name>` and `dec-<base-name>` for
  `base-table`. `base-table` is inspected at macro-expansion time, so it has
  to be a literal. Power-of-two tables use base2-enc / base2-dec, every other
  table uses base-enc / base-dec."
  [base-name base-table]
  (let [table-name (symbol (str base-name "-table"))
        enc-fname (symbol (str "enc-" base-name))
        dec-fname (symbol (str "dec-" base-name))
        [enc-fn dec-fn] (if (get-bit-width base-table)
                          [`base2-enc `base2-dec]
                          [`base-enc `base-dec])]
    `(do
       (def ~table-name ~base-table)
       (def ~enc-fname (partial ~enc-fn ~base-table))
       (def ~dec-fname (partial ~dec-fn ~base-table)))))

;; (defbase base64 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/")
;; (defbase base32 "ABCDEFGHIJKLMNOPQRSTUVWXYZ234567")
;; (defbase base16 "0123456789ABCDEF")
;; (defbase base2 "01")
;; (defbase base58 "123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz")
;; (defbase base36 "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ")
;; (defbase base10 "0123456789")

(defbase base8 "01234567")

(comment
  ;; NOTE(review): `base64/encode`, `base32/encode` and `codecs/...` below are
  ;; aliases of external libraries that this file does not require; they are
  ;; only meant for comparison at a REPL where those libraries are loaded.

  base8-table
  ;; => "01234567"
  (enc-base8 "test 1")
  ;; => "3506256335020061"
  (dec-base8 *1)
  ;; => "test 1"

  (= 2 (padding-count [1 2 0 3 0 0]))
  (= 0 (padding-count [1 2 3]))
  (= 0 (padding-count []))

  (= [1 2 0 3] (drop-padding [1 2 0 3 0 0]))

  (= "I" (base2-dec base64-table "SQ=="))
  (= "AM" (base2-dec base64-table "QU0="))

  (base2-enc base64-table "I")
  ;; => "SQ=="
  (base2-enc base64-table "AM")
  ;; => "QU0="
  (String. (base64/encode "I"))
  ;; => "SQ=="
  (String. (base64/encode "AM"))
  ;; => "QU0="

  (= "this is a test" (base2-dec base32-table "ORUGS4ZANFZSAYJAORSXG5A="))
  (base32/encode "this is a test")
  ;; => "ORUGS4ZANFZSAYJAORSXG5A="
  (base2-enc base32-table "this is a test")
  ;; => "ORUGS4ZANFZSAYJAORSXG5A="

  ;; the table is not a power-of-two table, so this throws
  (= "base58_is_boring" (base2-dec base58-table "D9cS9N9iHjMLTdA8YSMRMp"))

  (= 6 (get-bit-width base64-table))
  (= 5 (get-bit-width base32-table))
  (= nil (get-bit-width base58-table))

  (enc-base32 "I")
  ;; base32 needs quite a lot of padding
  ;; => "JE======"
  (dec-base32 *1)
  ;; => "I"

  ;; base16 corresponds to the in-memory hex representation
  (enc-base16 "this is a test")
  ;; => "7468697320697320612074657374"
  (dec-base16 "7468697320697320612074657374")
  ;; => "this is a test"
  (dec-base16 "7468IIOO697320697320612074657374")
  ;; invalid characters are simply ignored
  ;; => "this is a test"
  (codecs/bytes->hex (codecs/str->bytes "this is a test"))
  ;; => "7468697320697320612074657374"

  ;; base2 corresponds to the in-memory binary representation
  (enc-base2 "t")
  ;; => "01110100"
  ;; binary of \t
  (Integer/toString (int \t) 2)
  ;; => "1110100"
  (dec-base2 "01110100")
  ;; => "t"
  ;; note that the leading 0 is required when decoding

  (def s1 "this is a test")
  (= (enc-base64 s1) (String. (base64/encode s1)))
  (= (enc-base32 s1) (base32/encode s1))
  )

;;; ============== Automatic base decoding

(defn valid-base-str?
  "Returns true when every character of `s` is in `base-table` or is the
  padding character \"=\"."
  [base-table s]
  (clojure.set/subset? (set s)
                       (conj (set base-table) \=)))

(defn guess-base
  "Guesses which base `text` is encoded in. Returns :base16, :base32, :base64
  or :unknown. Empty text is :unknown: it trivially fits every table and
  there is nothing left to decode."
  [text]
  (cond
    (empty? text) :unknown
    ;; The tables have to be tested from the smallest to the largest.
    (valid-base-str? base16-table text) :base16
    (valid-base-str? base32-table text) :base32
    (valid-base-str? base64-table text) :base64
    :else :unknown))

(defn decode
  "Decodes `text` repeatedly, guessing the base at every step, until the text
  no longer looks like base16/base32/base64 (or is empty). Prints the base
  chosen at every step and the final result, and returns the final text.
  Options: `:step` is the number printed for the first step (default 1);
  `:debug` true additionally prints the text before each step."
  [text & {:keys [step debug]
           :or {step 1 debug false}}]
  (loop [text text
         step step]
    (when debug
      (println "step" step "decode:" text))
    (case (guess-base text)
      :base16 (do
                (println "step" step "--> base16")
                (recur (dec-base16 text) (inc step)))
      :base32 (do
                (println "step" step "--> base32")
                (recur (dec-base32 text) (inc step)))
      :base64 (do
                (println "step" step "--> base64")
                (recur (dec-base64 text) (inc step)))
      :unknown (do
                 (println "result:" text)
                 text))))

(comment
  (def text "3441353234343435353735323442353634423539354134353336353333323536343735323444343535333537344234433441343634353535353535323533343734413335344134363439353534423530344135413437353634463444353334463441344534443435333235363533353534423531354134363431353334413335")
  ;; Tests showed that converting through bit sequences is rather slow for
  ;; long strings.
  (decode text)
  ;; step 1 --> base16
  ;; step 2 --> base16
  ;; step 3 --> base32
  ;; step 4 --> base32
  ;; step 5 --> base64
  ;; step 6 --> base64
  ;; result: key:hi!venus
  ;; => "key:hi!venus"
  )
3 結論
base編碼基本是碼表轉換,產生人可讀的字符表示,不適合加密。
Created: 2019-04-11 四 10:28