Html批量轉csv

本文轉載自查看原文 2020-09-15 14:27 599 數據文件處理

不知道大家有沒有遇到這樣的問題：某些業務系統有導出數據功能，導出的數據都是存放在excel表格里面，需要批量轉csv，

但是這樣的文件不是標准的excel文檔，本質是html文檔

比如說，系統導出的文檔是這樣的

從這里我們可以看出來，感覺就是一個普通的excel文檔，通過office也能正常打開，但是你通過編寫代碼批量轉csv的時候，就出問題

我也是在無意中發現這不是標准的excel文檔，我們通過文檔編輯器打開試試

這明顯就是html文件，只能怪這個業務系統的開發人員不夠嚴謹了，現在需要我們來解決這樣的問題

我們先在idea里面創建一個maven項目

package com.gong;


import java.io.*;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.Scanner;

import org.apache.commons.lang.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Jsoup-based helpers for extracting table data from HTML documents (Jsoup
 * selectors use a jQuery-like syntax) and for batch-converting HTML files that
 * were exported with an Excel extension into CSV files.
 *
 * @author chixh
 */
public class HtmlParser {
    /** Row/column storage; none of the static helpers in this class read or write it. */
    protected List<List<String>> data = new LinkedList<List<String>>();

    /**
     * Returns the {@code value} attribute of an element.
     *
     * @param e element to inspect
     * @return the result of {@code e.attr("value")}
     */
    public static String getValue(Element e) {
        return e.attr("value");
    }

    /**
     * Returns the text contained in an element (for example the text between
     * {@code <tr>} and {@code </tr>}).
     *
     * @param e element to inspect
     * @return the result of {@code e.text()}
     */
    public static String getText(Element e) {
        return e.text();
    }

    /**
     * Parses {@code body} and looks up the element with the given id.
     *
     * @param body HTML markup to parse
     * @param id   id attribute to look for (without the leading {@code #})
     * @return the first match of the selector {@code #id}, as returned by
     *         {@code Elements.first()}
     */
    public static Element getID(String body, String id) {
        Document doc = Jsoup.parse(body);
        // An id is normally unique per page, so only the first match is returned.
        Elements elements = doc.select("#" + id);
        return elements.first();
    }

    /**
     * Parses {@code body} and looks up the elements carrying the given CSS class.
     *
     * @param body     HTML markup to parse
     * @param classTag class name to look for (without the leading {@code .})
     * @return all matches of the selector {@code .classTag}
     */
    public static Elements getClassTag(String body, String classTag) {
        Document doc = Jsoup.parse(body);
        return doc.select("." + classTag);
    }

    /**
     * Returns the {@code tr} elements found under an element.
     *
     * @param e element to search
     * @return the result of {@code e.getElementsByTag("tr")}
     */
    public static Elements getTR(Element e) {
        return e.getElementsByTag("tr");
    }

    /**
     * Returns the {@code td} elements found under an element.
     *
     * @param e element to search
     * @return the result of {@code e.getElementsByTag("td")}
     */
    public static Elements getTD(Element e) {
        return e.getElementsByTag("td");
    }

    /**
     * Converts a table element into a list of rows, each row being the list of
     * its cell texts.
     *
     * <p>For every {@code tr}, the {@code th} texts (if any) are emitted as one row
     * and the {@code td} texts (if any) as a following row; a {@code tr} without
     * cells contributes nothing.
     *
     * @param table table element to read; {@code null} yields an empty result
     * @return the extracted rows, never {@code null}
     */
    public static List<List<String>> getTables(Element table) {
        List<List<String>> rows = new ArrayList<>();
        if (table == null) {
            return rows;
        }
        for (Element tr : table.select("tr")) {
            List<String> headerCells = new ArrayList<>();
            for (Element th : tr.select("th")) {
                headerCells.add(th.text());
            }
            if (!headerCells.isEmpty()) {
                rows.add(headerCells);
            }

            List<String> dataCells = new ArrayList<>();
            for (Element td : tr.select("td")) {
                dataCells.add(td.text());
            }
            if (!dataCells.isEmpty()) {
                rows.add(dataCells);
            }
        }
        return rows;
    }

    /**
     * Reads a whole file into a string using the platform default charset.
     *
     * <p>I/O failures are reported on standard error and whatever was read up to
     * that point is returned (an empty string when the file cannot be opened).
     *
     * @param fileName path of the file to read
     * @return the decoded file content
     */
    public static String readHtml(String fileName) {
        ByteArrayOutputStream content = new ByteArrayOutputStream();
        try (InputStream in = new FileInputStream(fileName)) {
            byte[] chunk = new byte[1024];
            int read;
            while ((read = in.read(chunk)) != -1) {
                // Copy only the bytes actually read; the rest of the buffer is stale.
                content.write(chunk, 0, read);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        // Decode once at the end so multi-byte characters that straddle a chunk
        // boundary are not broken apart.
        return content.toString();
    }

    /**
     * Converts every regular file directly inside {@code inputexecl} into a CSV
     * file named after it (extension replaced by {@code .csv}) inside
     * {@code outputcsv}. Sub-directories are listed on the console but not entered;
     * files without a {@code <table>} element are skipped.
     *
     * @param inputexecl directory holding the HTML ("Excel") files
     * @param outputcsv  directory receiving the CSV files
     */
    public static void getFileName(String inputexecl, String outputcsv) {
        File inputDir = new File(inputexecl);
        if (!inputDir.exists()) {
            System.out.println(inputexecl + " not exists");
            return;
        }

        File[] entries = inputDir.listFiles();
        if (entries == null) {
            // listFiles() returns null when the path is not a directory or cannot be read.
            System.out.println(inputexecl + " is not a readable directory");
            return;
        }

        for (File entry : entries) {
            if (entry.isDirectory()) {
                System.out.println(entry.getName() + " [目錄]");
                continue;
            }

            Document doc = Jsoup.parse(readHtml(entry.getPath()));
            Element table = doc.select("table").first();
            if (table == null) {
                // Not one of the HTML exports: skip it instead of aborting the batch.
                System.out.println(entry.getName() + " contains no <table>, skipped");
                continue;
            }

            List<List<String>> rows = getTables(table);
            // Console echo of the extracted cells, one table row per line.
            for (List<String> row : rows) {
                for (String cell : row) {
                    System.out.print(cell + ",");
                }
                System.out.println();
            }

            String baseName = StringUtils.substringBeforeLast(entry.getName(), ".");
            try {
                writeCsv(resolveCsvFile(outputcsv, baseName), rows);
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    /** Builds the target file so the output folder works with or without a trailing separator. */
    private static File resolveCsvFile(String outputDir, String baseName) {
        String fileName = baseName + ".csv";
        if (outputDir == null || outputDir.isEmpty()) {
            return new File(fileName);
        }
        return new File(outputDir, fileName);
    }

    /** Writes the rows as UTF-8 CSV lines; the writer is closed even when a write fails. */
    private static void writeCsv(File target, List<List<String>> rows) throws IOException {
        try (BufferedWriter writer = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(target), "UTF-8"))) {
            for (List<String> row : rows) {
                System.out.println(row);
                writer.write(toCsvLine(row));
                writer.newLine();
            }
        }
    }

    /** Joins the cells of one row with commas, quoting each cell where required. */
    private static String toCsvLine(List<String> cells) {
        StringBuilder line = new StringBuilder();
        for (int i = 0; i < cells.size(); i++) {
            if (i > 0) {
                line.append(',');
            }
            line.append(escapeCsv(cells.get(i)));
        }
        return line.toString();
    }

    /** Quotes a cell that contains a comma, a double quote or a line break, doubling inner quotes. */
    private static String escapeCsv(String field) {
        boolean needsQuoting = field.indexOf(',') >= 0 || field.indexOf('"') >= 0
                || field.indexOf('\n') >= 0 || field.indexOf('\r') >= 0;
        if (!needsQuoting) {
            return field;
        }
        return "\"" + field.replace("\"", "\"\"") + "\"";
    }

    /**
     * Asks for the input and output directories on standard input and runs the
     * batch conversion.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // A single Scanner is shared for both prompts: a second Scanner on System.in
        // can miss input that the first one has already buffered.
        Scanner console = new Scanner(System.in);
        System.out.println("請輸入Execl數據所在路徑");
        String input = console.nextLine();
        System.out.println("請輸入csv文件數據的輸出路徑");
        String output = console.nextLine();
        getFileName(input, output);
    }

}

pom.xml

<?xml version="1.0" encoding="UTF-8"?>

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <groupId>com.gong</groupId>
  <artifactId>csv</artifactId>
  <version>1.0-SNAPSHOT</version>

  <name>csv</name>
  <!-- FIXME change it to the project's website -->
  <url>http://www.example.com</url>

  <!-- Build settings: UTF-8 source encoding and Java 1.7 as compiler source/target level.
       The diamond operator (new ArrayList<>()) used in HtmlParser needs at least level 1.7. -->
  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <maven.compiler.source>1.7</maven.compiler.source>
    <maven.compiler.target>1.7</maven.compiler.target>
  </properties>

  <!-- Project dependencies. The HtmlParser source shown with this POM imports only
       org.jsoup (jsoup) and org.apache.commons.lang.StringUtils (commons-lang).
       NOTE(review): the remaining artifacts are not referenced by that source;
       confirm whether other code still needs them. -->
  <dependencies>
    <!-- Test-scoped unit testing library. -->
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.11</version>
      <scope>test</scope>
    </dependency>
    <!-- NOTE(review): opencsv is not referenced by HtmlParser; confirm it is needed. -->
    <dependency>
                 <groupId>net.sf.opencsv</groupId>
                 <artifactId>opencsv</artifactId>
                 <version>2.1</version>
             </dependency>
           <!-- NOTE(review): ooxml-schemas 1.1 is declared here with type pom and again
                further down with the default type; confirm both entries are intended. -->
           <dependency>
                <groupId>org.apache.poi</groupId>
               <artifactId>ooxml-schemas</artifactId>
                <version>1.1</version>
               <type>pom</type>
            </dependency>
           <!-- NOTE(review): Apache POI is not referenced by HtmlParser; confirm it is needed. -->
           <dependency>
                <groupId>org.apache.poi</groupId>
                <artifactId>poi</artifactId>
                <version>3.7</version>
           </dependency>
           <!-- Second declaration of ooxml-schemas 1.1 (default type). -->
           <dependency>
                <groupId>org.apache.poi</groupId>
                <artifactId>ooxml-schemas</artifactId>
                <version>1.1</version>
            </dependency>
            <dependency>
                <groupId>org.apache.poi</groupId>
                <artifactId>poi-ooxml</artifactId>
                <version>3.7</version>
            </dependency>
            <!-- NOTE(review): dom4j is not referenced by HtmlParser; confirm it is needed. -->
            <dependency>
                <groupId>dom4j</groupId>
                <artifactId>dom4j</artifactId>
                <version>1.6.1</version>
            </dependency>
      <!-- https://mvnrepository.com/artifact/net.sourceforge.jexcelapi/jxl -->
      <!-- NOTE(review): jxl is not referenced by HtmlParser; confirm it is needed. -->
      <dependency>
          <groupId>net.sourceforge.jexcelapi</groupId>
          <artifactId>jxl</artifactId>
          <version>2.6.12</version>
      </dependency>
      <!-- Provides org.apache.commons.lang.StringUtils, used by HtmlParser to strip file extensions. -->
      <dependency>
          <groupId>commons-lang</groupId>
          <artifactId>commons-lang</artifactId>
          <version>2.6</version>
      </dependency>
      <!-- HTML parser used by HtmlParser (org.jsoup.Jsoup, Document, Element, Elements). -->
      <dependency>
          <groupId>org.jsoup</groupId>
          <artifactId>jsoup</artifactId>
          <version>1.11.3</version>
      </dependency>

  </dependencies>

  <build>
    <pluginManagement><!-- lock down plugins versions to avoid using Maven defaults (may be moved to parent pom) -->
      <plugins>
        <!-- clean lifecycle, see https://maven.apache.org/ref/current/maven-core/lifecycles.html#clean_Lifecycle -->
        <plugin>
          <artifactId>maven-clean-plugin</artifactId>
          <version>3.1.0</version>
        </plugin>
        <!-- default lifecycle, jar packaging: see https://maven.apache.org/ref/current/maven-core/default-bindings.html#Plugin_bindings_for_jar_packaging -->
        <plugin>
          <artifactId>maven-resources-plugin</artifactId>
          <version>3.0.2</version>
        </plugin>
        <plugin>
          <artifactId>maven-compiler-plugin</artifactId>
          <version>3.8.0</version>
        </plugin>
        <plugin>
          <artifactId>maven-surefire-plugin</artifactId>
          <version>2.22.1</version>
        </plugin>
        <plugin>
          <artifactId>maven-jar-plugin</artifactId>
          <version>3.0.2</version>
        </plugin>
        <plugin>
          <artifactId>maven-install-plugin</artifactId>
          <version>2.5.2</version>
        </plugin>
        <plugin>
          <artifactId>maven-deploy-plugin</artifactId>
          <version>2.8.2</version>
        </plugin>
        <!-- site lifecycle, see https://maven.apache.org/ref/current/maven-core/lifecycles.html#site_Lifecycle -->
        <plugin>
          <artifactId>maven-site-plugin</artifactId>
          <version>3.7.1</version>
        </plugin>
        <plugin>
          <artifactId>maven-project-info-reports-plugin</artifactId>
          <version>3.0.0</version>
        </plugin>
      </plugins>

    </pluginManagement>
  </build>
</project>

運行分別輸入excel文檔的目錄和csv的輸出目錄就可以了，在這里提醒一下大家，如果使用我這段代碼的話，excel文檔的數據文件不能帶有其他類型的文件。

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯系本站郵箱yoyou2525@163.com刪除。

猜您在找 文件讀寫與csv轉html csv批量導入mysql命令 CSV轉Excel格式 csv文件轉化為HTML postman導入csv文件，批量運行 Python實現對csv的批量處理並保存 JAVA代碼csv轉excel CSV文件轉EXCEl（java） excel批量轉換為CSV格式，xls批量導出csv格式 oracle批量update 轉