不知道大家有沒有遇到這樣的問題:某些業務系統有導出數據功能,導出的數據都是存放在excel表格裡面,需要批量轉csv,
但是這樣的文件不是標準的excel文檔,本質是html文檔
比如說,系統導出的文檔是這樣的
從這裡我們可以看出來,感覺就是一個普通的excel文檔,通過office也能正常打開,但是你通過編寫代碼批量轉csv的時候,就會出問題
我也是在無意中發現這不是標準的excel文檔,我們通過文檔編輯器打開試試
這明顯就是html文件,只能怪這個業務系統的開發人員不夠嚴謹了,現在需要我們來解決這樣的問題
我們先在idea里面創建一個maven項目
package com.gong; import java.io.*; import java.util.ArrayList; import java.util.LinkedList; import java.util.List; import java.util.Scanner; import org.apache.commons.lang.StringUtils; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; /** * Jsoup解析html標簽時類似於JQuery的一些符號 * * @author chixh * */ public class HtmlParser { protected List<List<String>> data = new LinkedList<List<String>>(); /** * 獲取value值 * * @param e * @return */ public static String getValue(Element e) { return e.attr("value"); } /** * 獲取 * <tr> * 和 * </tr> * 之間的文本 * * @param e * @return */ public static String getText(Element e) { return e.text(); } /** * 識別屬性id的標簽,一般一個html頁面id唯一 * * @param body * @param id * @return */ public static Element getID(String body, String id) { Document doc = Jsoup.parse(body); // 所有#id的標簽 Elements elements = doc.select("#" + id); // 返回第一個 return elements.first(); } /** * 識別屬性class的標簽 * * @param body * @param class * @return */ public static Elements getClassTag(String body, String classTag) { Document doc = Jsoup.parse(body); // 所有#id的標簽 return doc.select("." 
+ classTag); } /** * 獲取tr標簽元素組 * * @param e * @return */ public static Elements getTR(Element e) { return e.getElementsByTag("tr"); } /** * 獲取td標簽元素組 * * @param e * @return */ public static Elements getTD(Element e) { return e.getElementsByTag("td"); } /** * 獲取表元組 * @param table * @return */ public static List<List<String>> getTables(Element table){ List<List<String>> data = new ArrayList<>(); for (Element etr : table.select("tr")) { List<String> listh=new ArrayList<>(); //獲取表頭 for(Element eth : etr.select("th")){ String th=eth.text(); listh.add(th); } if(!listh.isEmpty()) { data.add(listh); } List<String> list = new ArrayList<>(); for (Element etd : etr.select("td")) { String temp = etd.text(); //增加一行中的一列 list.add(temp); } //增加一行 if(!list.isEmpty()) { data.add(list); } } return data; } /** * 讀html文件 * @param fileName * @return */ public static String readHtml(String fileName){ FileInputStream fis = null; StringBuffer sb = new StringBuffer(); try { fis = new FileInputStream(fileName); byte[] bytes = new byte[1024]; while (-1 != fis.read(bytes)) { sb.append(new String(bytes)); } } catch (FileNotFoundException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } finally { try { fis.close(); } catch (IOException e1) { e1.printStackTrace(); } } return sb.toString(); } public static void getFileName(String inputexecl,String outputcsv){ // Document doc2 = Jsoup.parse(readHtml("E:\\datas\\小組成員學習情況統計(11).xls")); String path = inputexecl; File f = new File(path); if (!f.exists()) { System.out.println(path + " not exists"); return; } File fa[] = f.listFiles();//獲取該目錄下所有文件和目錄的絕對路徑 for (int i = 0; i < fa.length; i++) { File fs = fa[i]; if (fs.isDirectory()) { System.out.println(fs.getName() + " [目錄]"); } else{ String filepath= String.valueOf(fs); Document doc2 = Jsoup.parse(readHtml(filepath)); Element table = doc2.select("table").first(); //獲取table表的內容,存放到List集合里面 List<List<String>> list = getTables(table); for (List<String> list2 : list) { for (String 
string : list2) { System.out.print(string+","); } System.out.println(); } String name= StringUtils.substringBeforeLast(fs.getName(),".");//獲取文件名字部分 //String newFilePath="E:\\datas\\csv\\小組成員學習.csv"; String newFilePath=outputcsv+name+".csv"; String savePath = newFilePath; File saveCSV = new File(savePath); String buffer=""; try { if(!saveCSV.exists()) saveCSV.createNewFile(); OutputStreamWriter write = new OutputStreamWriter(new FileOutputStream(saveCSV ),"UTF-8"); BufferedWriter writer = new BufferedWriter(write); for(int j=0;j<list.size();j++){ List<String> list1=new ArrayList<String>(); buffer=list.get(j).toString(); System.out.println(buffer); buffer = buffer.substring(1, buffer.lastIndexOf("]")).toString(); list1.add(buffer); writer.write(buffer); writer.newLine(); } writer.close(); } catch (IOException e) { e.printStackTrace(); } } } } public static void main(String[] args) { System.out.println("請輸入Execl數據所在路徑"); Scanner execl=new Scanner(System.in); String input=execl.nextLine(); //獲取execl輸入路徑 System.out.println("請輸入csv文件數據的輸出路徑"); Scanner csv=new Scanner(System.in); String output = csv.nextLine(); getFileName(input,output); } }
pom.xml
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <groupId>com.gong</groupId>
  <artifactId>csv</artifactId>
  <version>1.0-SNAPSHOT</version>

  <name>csv</name>
  <!-- FIXME change it to the project's website -->
  <url>http://www.example.com</url>

  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <maven.compiler.source>1.7</maven.compiler.source>
    <maven.compiler.target>1.7</maven.compiler.target>
  </properties>

  <dependencies>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.11</version>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>net.sf.opencsv</groupId>
      <artifactId>opencsv</artifactId>
      <version>2.1</version>
    </dependency>
    <dependency>
      <groupId>org.apache.poi</groupId>
      <artifactId>ooxml-schemas</artifactId>
      <version>1.1</version>
      <type>pom</type>
    </dependency>
    <dependency>
      <groupId>org.apache.poi</groupId>
      <artifactId>poi</artifactId>
      <version>3.7</version>
    </dependency>
    <dependency>
      <groupId>org.apache.poi</groupId>
      <artifactId>ooxml-schemas</artifactId>
      <version>1.1</version>
    </dependency>
    <dependency>
      <groupId>org.apache.poi</groupId>
      <artifactId>poi-ooxml</artifactId>
      <version>3.7</version>
    </dependency>
    <dependency>
      <groupId>dom4j</groupId>
      <artifactId>dom4j</artifactId>
      <version>1.6.1</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/net.sourceforge.jexcelapi/jxl -->
    <dependency>
      <groupId>net.sourceforge.jexcelapi</groupId>
      <artifactId>jxl</artifactId>
      <version>2.6.12</version>
    </dependency>
    <dependency>
      <groupId>commons-lang</groupId>
      <artifactId>commons-lang</artifactId>
      <version>2.6</version>
    </dependency>
    <dependency>
      <groupId>org.jsoup</groupId>
      <artifactId>jsoup</artifactId>
      <version>1.11.3</version>
    </dependency>
  </dependencies>

  <build>
    <pluginManagement><!-- lock down plugins versions to avoid using Maven defaults (may be moved to parent pom) -->
      <plugins>
        <!-- clean lifecycle, see https://maven.apache.org/ref/current/maven-core/lifecycles.html#clean_Lifecycle -->
        <plugin>
          <artifactId>maven-clean-plugin</artifactId>
          <version>3.1.0</version>
        </plugin>
        <!-- default lifecycle, jar packaging: see https://maven.apache.org/ref/current/maven-core/default-bindings.html#Plugin_bindings_for_jar_packaging -->
        <plugin>
          <artifactId>maven-resources-plugin</artifactId>
          <version>3.0.2</version>
        </plugin>
        <plugin>
          <artifactId>maven-compiler-plugin</artifactId>
          <version>3.8.0</version>
        </plugin>
        <plugin>
          <artifactId>maven-surefire-plugin</artifactId>
          <version>2.22.1</version>
        </plugin>
        <plugin>
          <artifactId>maven-jar-plugin</artifactId>
          <version>3.0.2</version>
        </plugin>
        <plugin>
          <artifactId>maven-install-plugin</artifactId>
          <version>2.5.2</version>
        </plugin>
        <plugin>
          <artifactId>maven-deploy-plugin</artifactId>
          <version>2.8.2</version>
        </plugin>
        <!-- site lifecycle, see https://maven.apache.org/ref/current/maven-core/lifecycles.html#site_Lifecycle -->
        <plugin>
          <artifactId>maven-site-plugin</artifactId>
          <version>3.7.1</version>
        </plugin>
        <plugin>
          <artifactId>maven-project-info-reports-plugin</artifactId>
          <version>3.0.0</version>
        </plugin>
      </plugins>
    </pluginManagement>
  </build>
</project>
運行程序後,分別輸入excel文檔所在的目錄和csv的輸出目錄就可以了。在這裡提醒一下大家,如果使用我這段代碼的話,存放excel文檔的目錄裡不能帶有其他類型的文件。