划詞標注1——使用svg繪制換行文本並自動識別庫中字典數據

本文轉載自查看原文 2019-11-26 14:43 297 SVG

業務需求

給出一段文本，自動識別出文本中包含的關鍵字信息，關鍵字是庫里已知的數據，根據類型的不同顯示出不同的顏色

業務分析

1）采用css：文本識別出來后，根據識別出的文本更改對應文本的dom，通過更改css來實現
缺點：比較麻煩，只能標注顏色，不易於維護和拓展
2）采用svg：使用svg繪制文本，繪制顏色標注
優點：比較靈活，便於后續拓展

實現效果

實現難點

想象很美好，實現很骨感，代碼實現的過程中遇到了不少問題，這里記錄下解決方法。本文主要粘貼核心代碼，不是全部的業務代碼哦~

1、svg的換行顯示

svg的text元素不支持自動換行顯示，需要自行計算每個字的位置並手動換行
設計思路：需要自動識別字典文本，使用到文本的下標，根據下標位置來進行保存和標注。因此將每個文本都單獨設置成一個tspan，由於識別的文字包含漢字，英文字母，符號等，所以相應的文字給予對應的長度。獲取svg的最大顯示寬度，當文本的寬度>svg寬度的時候實行自動換行。

1.1 文本寬度的設定
不同的文本的寬度不一樣，漢字、符號還好可以給個統一的設置，但是英文字母，有的寬，有的窄，如果設置成一樣的，顯示會很怪，這里經過測試，獲取了一個正常顯示的范圍值。（本文項目代碼基於vue.js）
各類字符的寬度（如chWidth、enWidth、numWidth等）定義為常量，根據字符類型獲取寬度的方法如下：

   // 獲取文本的寬度
    getTxtWidth(text) {
      let smallEnglishRegx = /^[a-z]+$/; // 小寫
      let bigEnglishRegx = /^[A-Z]+$/; // 大寫
      let numberRegx = /^[0-9]$/; // 數字
      let chinaRegx = /[\u4E00-\u9FA5\uF900-\uFA2D]/; // 中文
      let unitWidth;

      if (chinaRegx.test(text)) {
        unitWidth = this.chWidth;
      } else if (/\s/.test(text)) {
        unitWidth = this.spaceWidth;
      } else if (smallEnglishRegx.test(text)) {
        unitWidth = this.enWidth;
      } else if (bigEnglishRegx.test(text)) {
        unitWidth = this.bigEnWidth;
      } else if (numberRegx.test(text)) {
        unitWidth = this.numWidth;
      } else {
        unitWidth = this.charWidth;
      }
      // 特殊文本的特殊處理
      if (this.smallerLetter1.includes(text)) {
        unitWidth = unitWidth - 1;
      }
      if (this.smallerLetter2.includes(text)) {
        unitWidth = unitWidth - 2;
      }
      if (this.smallerLetter4.includes(text)) {
        unitWidth = unitWidth - 4;
      }
      if (this.bigLetter1.includes(text)) {
        unitWidth = unitWidth + 1;
      }
      if (this.bigLetter2.includes(text)) {
        unitWidth = unitWidth + 2;
      }
      if (this.bigLetter4.includes(text)) {
        unitWidth = unitWidth + 4;
      }
      return unitWidth;
    }

1.2 文本的拆分
文本中原有的換行需要保留並顯示；超出svg寬度的部分，則需要手動換行。具體實現看代碼，這里使用的是svg.js庫用於繪制svg圖形。

chunkWords() {
      this.dataChunk = [];
      let text = this.words;
      
      // 按換行符號換行
      text = text.replace("↵↵", "\n");
      text = text.replace("↵", "\n");
      let sentenceArr = text.split("\n");

      for (let i = 0, len = sentenceArr.length; i < len; i++) {
        // 先按空格分開
        let wordsArr = sentenceArr[i].split(" ");

        // 再把每個字都分了
        let wordsArrCopy = [];
        for (let j = 0, len2 = wordsArr.length; j < len2; j++) {
          // 判斷是否包含中文，如果包含中文再繼續拆分
          let unit = wordsArr[j];
          for (let k = 0, len3 = unit.length; k < len3; k++) {
            let firstword = unit.slice(k, k + 1);

            // 插入
            wordsArrCopy.push(firstword);
          }

          // 空格也要加上
          wordsArrCopy.push(" ");
        }
        sentenceArr[i] = wordsArrCopy;
      }

      // 再加上換行符，用於后面的換行，SVG文本不支持自己換行
      for (let i = 0, len = sentenceArr.length; i < len; i++) {
        let item = sentenceArr[i];
        let length = item.length;
        // 判斷最后一個是不是有字
        let lastWord = item[length - 1].trim();
        // 有字則新增個
        if (lastWord) {
          item[length] = "↵↵";
        } else {
          item[length - 1] = "↵↵";
        }
      }

      // 對每行再進行拆分，如果大於svg的寬度后再進行換行
      // this.dataChunk = [[],[]]
      this.chunkIndex = 0;
      for (let i = 0, len = sentenceArr.length; i < len; i++) {
        // 獲取當前this.dataChunk[index]的總長度，大於等於this.svgX+currentWidth的時候加行；
        // 每個span是一個對象，包含一些字段信息。一個對象對應一個tspan
        let sentence = sentenceArr[i];

        // 需要換行的情況
        if (this.chunkIndex > 0 && this.dataChunk[this.chunkIndex].length) {
          this.chunkIndex++;
        }

        for (let j = 0, len2 = sentence.length; j < len2; j++) {
          let unit2 = sentence[j];
          let unitWidth = this.getTxtWidth(unit2);
          this.insertDataChunk(unit2, unitWidth);
        }
      }

      this.drawText();
    },

    // 根據分片繪制文本
    drawText() {
      this.textGroup.clear();
      let that = this;
      for (let i = 0, len = this.dataChunk.length; i < len; i++) {
        let item = this.dataChunk[i];

        this.textGroup
          .text(function(add) {
            for (let j = 0, len2 = item.length; j < len2; j++) {
              let unit = item[j];
              if (i === 0) {
                item[j].offset = j;
                item.allOffset = j;
              } else {
                let pos = that.dataChunk[i - 1].allOffset + j + 1;
                item[j].offset = pos;
                item.allOffset = pos;
              }

              item[j].row = i;
              item[j].index = j;

              // 記錄下來
              that.textDom[item[j].offset] = add
                .tspan(unit.text)
                .attr("x", unit.dx)
                .attr("y", unit.dy)
                .data("offset", item[j].offset);
            }
          })
          .data("row", i);
      }
    }

實現出來的效果：

這樣，每個文字都被拆成一個tspan並包含對應的data-offset屬性了。

2、識別庫中數據

庫里的數據分不同的類型，以數組形式顯示，類似這樣：

根據給的文本，如果包括了數組中的數據，則高亮顯示：

// check語句，將句子中已有的實體/關系/操作/屬性識別出來
    checkWord() {
      // 對換行符號進行相同的處理
      let words = this.words;
      words = words.replace("↵↵", "\n");
      words = words.replace("↵", "\n");
      words = words.split("\n");
      this.words = words.join("");

      // 識別關系
      for (let i = 0, len = this.relationArr.length; i < len; i++) {
        this.setKnownData(this.relationArr[i], "relation");
      }

      // 識別操作
      for (let i = 0, len = this.operateArr.length; i < len; i++) {
        this.setKnownData(this.operateArr[i], "operate");
      }

      // 識別實體
      for (let i = 0, len = this.objectArr.length; i < len; i++) {
        this.setKnownData(this.objectArr[i], "object");
      }
      
      // 識別屬性
      for (let i = 0, len = this.attrArr.length; i < len; i++) {
        this.setKnownData(this.attrArr[i], "attr");
      }     
      
      // 根據獲取的數據來渲染高亮片段
      ...
    },

    // 設置已知數據，獲取的數據放到this.result中
    setKnownData(item, type, pwords, pindex) {
      let words = pwords ? pwords : this.words;
      let index = words.indexOf(item);
      let stringLen = item.length;

      pindex = pindex ? pindex : 0;

      if (index > -1) {
        // 構造標注需要的數據
        let data = {
          type,
          word: item,
          name: item,
          offset: [index + pindex, index + pindex + stringLen - 1],
          id: Math.ceil(new Date().getTime() * Math.random() * (index + 1))
        };

        // 添加到數據中，根據位置信息來判斷
        if (this.result[type].length === 0) {
          this.result[type].push(data);
        } else {
          let insertIndex = -1;
          this.result[type].find((unit, index) => {
            if (data.offset[0] <= unit.offset[1]) {
              insertIndex = index;
              return true;
            }
          });
          if (insertIndex > -1) {
            this.result[type].splice(insertIndex, 0, data);
          } else {
            this.result[type].push(data);
          }
        }

        // 繼續遍歷，可能會包含多個
        let word2 = words.substr(index + stringLen);
        this.setKnownData(item, type, word2, pindex + index + stringLen);
      }
    }

3、繪制不同顏色的矩形

svg.js繪制矩形的方法很簡單，需要確定的是繪制的矩形的寬高，位置即可，而這些信息根據字符的offset就可以算出來。在上面的數據中，我們在result中存了一些識別出來的數據。根據這些數據即可繪制不同顏色的矩形來了。

3.1 以數據為導向繪制圖形

 ...
 // Mark every recognised entry of every type held in this.result.
 for (const type in this.result) {
   const entries = this.result[type];
   entries.forEach(entry => this.sureMarkWord(entry));
 }

  // 確定標注數據，高亮文本，標注實體
   sureMarkWord(data) {
      // 根據坐標獲取字的信息
      let start = this.findWord(data.offset[0]);
      let end = this.findWord(data.offset[1]);
      if (!start || !end) {
        return;
      }

      let startRow = start.row;
      let endRow = end.row;
      let startIndex = start.index;
      let endIndex = end.index;

      // 同一行
      if (startRow == endRow) {        
        this.singleRowMark(start, end, data, endRow);
      } else { 
        //1,endRow從起始開始標注
        let start_endrow = this.dataChunk[endRow][0];
        this.singleRowMark(start_endrow, end, data, endRow);

        // endRow前面的行全部標注上
        for (let i = startRow; i < endRow; i++) {
          let len = this.dataChunk[i].length;
          let end_i = this.dataChunk[i][len - 1];
          if (i === startRow) {
            this.singleRowMark(start, end_i, data, startRow, true);
          } else {
            // 整行標注
            this.singleRowMark(
              this.dataChunk[i][0],
              end_i,
              data,
              i
            );
          }
        }
      }
    },

// 根據位置選擇文字
    findWord(offset) {
      let result = null;
      for (let i = 0, len = this.dataChunk.length; i < len; i++) {
        let item = this.dataChunk[i];
        for (let j = 0, len2 = item.length; j < len2; j++) {
          let unit = item[j];
          if (unit.offset === offset) {
            result = unit;
            break;
          }
        }

        if (result) {
          break;
        }
      }
      return result;
    },
  
    singleRowMark(start, end, data, row) {
      // 回調繪制chunk的矩形
      let width = end.dx + end.width - start.dx;
      let x = start.dx;
      let y = start.dy - this.wordHeight + 4;
      let height = this.wordHeight; // wordHeight是文本的高度，根據字體的大小設置，14px的定義為17
      let { name, type, word, id} = data;

      // 數據記錄
      let obj = {
        width,
        height,
        x,
        y,
        type,
        word,
        name,
        id,
        row,
        ry: y
      };      

      this.drawMarkGroups(obj);
    },

    // 文字底層顏色
    drawChunkRect(obj) {
      let { width, height, x, y, type, id, row, word} = obj;
      let color;
      color = this.wordColors[type]; // 根據類型的不同設置不同的顏色       
     
      let obj = {};
      // 記錄dom
      obj.rect = this.rectRows[row]
      .group()
      .rect(width, height)
      .move(x, y)
      .fill(isTemp ? "none" : color)
      .attr("id", id)
      .data("type", obj.type)
      .data("word", obj.word);
       this.wordRectDom[id].push(obj);
    }

至此，實現了划詞標注的顯示部分

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯系本站郵箱yoyou2525@163.com刪除。

猜您在找 讓html自動識別字符串中的換行符'\n' 換行輸出 IOS 實現TXT文本自動識別編碼的方法 canvas繪制文本自動換行 HTML中a標簽自動識別電話、郵箱 WebDriver中自動識別驗證碼--Python實現 Python 收貨地址自動識別（使用接口） WebDriver中自動識別驗證碼--Python實現圖片驗證碼自動識別，使用tess4j進行驗證碼自動識別(java實現) java自動識別用戶上傳的文本文件編碼 ArcGIS中應用Expressions標注(Label)之二—使用外部數據庫中數據標注要素