微博數據清洗(Java版)
大數據公益大學提供的一份數據,義務處理一下,原始數據是Excel,含有html標簽,如下:
要求清洗掉html標簽和微博內容中的url地址。
主要分為兩部分:
1.處理文本,清洗數據。
2.處理excel讀寫操作。
上代碼:
ExcelUtil類,包含Excel2003-2007的讀寫操作,Excel使用Apache POI進行操作,需要jar包如下:
- package dat.datadeal;
- import java.io.File;
- import java.io.FileInputStream;
- import java.io.FileNotFoundException;
- import java.io.FileOutputStream;
- import java.io.IOException;
- import java.io.InputStream;
- import java.text.ParseException;
- import java.text.SimpleDateFormat;
- import java.util.ArrayList;
- import java.util.Date;
- import java.util.List;
- import java.util.Locale;
- import java.util.logging.Level;
- import java.util.logging.Logger;
- import org.apache.poi.hssf.usermodel.HSSFCell;
- import org.apache.poi.hssf.usermodel.HSSFCellStyle;
- import org.apache.poi.hssf.usermodel.HSSFRow;
- import org.apache.poi.hssf.usermodel.HSSFSheet;
- import org.apache.poi.hssf.usermodel.HSSFWorkbook;
- import org.apache.poi.ss.usermodel.Cell;
- import org.apache.poi.ss.usermodel.DateUtil;
- import org.apache.poi.ss.usermodel.Row;
- import org.apache.poi.ss.usermodel.Sheet;
- import org.apache.poi.ss.usermodel.Workbook;
- import org.apache.poi.xssf.usermodel.XSSFWorkbook;
- /**
- *
- * @author daT dev.tao@gmail.com
- *2003,2007版excel讀寫工具
- */
- public class ExcelUtil{
- /**
- * Excel文件讀取
- * @param filePath
- * @return String[]存的是行,List存的是列。
- * 一個excel一次全部讀入內存(Excel超大需要另行處理)
- */
- public List<String[]> readExcel(String filePath) {
- List<String[]> dataList = new ArrayList<String[]>();
- boolean isExcel2003 = true;
- if (isExcel2007(filePath)) {
- isExcel2003 = false;
- }
- File file = new File(filePath);
- InputStream is = null;
- try {
- is = new FileInputStream(file);
- } catch (FileNotFoundException ex) {
- Logger.getLogger(ExcelUtil.class.getName()).log(Level.SEVERE, null, ex);
- }
- Workbook wb = null;
- try {
- wb = isExcel2003 ? new HSSFWorkbook(is) : new XSSFWorkbook(is);
- } catch (IOException ex) {
- Logger.getLogger(ExcelUtil.class.getName()).log(Level.SEVERE, null, ex);
- }
- Sheet sheet = wb.getSheetAt(0);
- int totalRows = sheet.getPhysicalNumberOfRows();
- int totalCells = 0;
- if (totalRows >= 1 && sheet.getRow(0) != null) {
- totalCells = sheet.getRow(0).getPhysicalNumberOfCells();
- }
- for (int r = 0; r < totalRows; r++) {
- Row row = sheet.getRow(r);
- if (row == null) {
- continue;
- }
- String[] rowList = new String[totalCells];
- for (int c = 0; c < totalCells; c++) {
- Cell cell = row.getCell(c);
- String cellValue = "";
- if (cell == null) {
- rowList[c] = (cellValue);
- continue;
- }
- cellValue = ConvertCellStr(cell, cellValue);
- rowList[c] = (cellValue);
- }
- dataList.add(rowList);
- }
- return dataList;
- }
- private String ConvertCellStr(Cell cell, String cellStr) {
- switch (cell.getCellType()) {
- case Cell.CELL_TYPE_STRING:
- // 讀取String
- cellStr = cell.getStringCellValue().toString();
- break;
- case Cell.CELL_TYPE_BOOLEAN:
- // 得到Boolean對象的方法
- cellStr = String.valueOf(cell.getBooleanCellValue());
- break;
- case Cell.CELL_TYPE_NUMERIC:
- // 先看是否是日期格式
- if (DateUtil.isCellDateFormatted(cell)) {
- // 讀取日期格式
- cellStr = formatTime(cell.getDateCellValue().toString());
- } else {
- // 讀取數字
- cellStr = String.valueOf(cell.getNumericCellValue());
- }
- break;
- case Cell.CELL_TYPE_FORMULA:
- // 讀取公式
- cellStr = cell.getCellFormula().toString();
- break;
- }
- return cellStr;
- }
- private boolean isExcel2007(String fileName) {
- return fileName.matches("^.+\\.(?i)(xlsx)$");
- }
- private String formatTime(String s) {
- SimpleDateFormat sf = new SimpleDateFormat("EEE MMM dd hh:mm:ss z yyyy", Locale.ENGLISH);
- Date date = null;
- try {
- date = sf.parse(s);
- } catch (ParseException ex) {
- Logger.getLogger(ExcelUtil.class.getName()).log(Level.SEVERE, null, ex);
- }
- SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
- String result = sdf.format(date);
- return result;
- }
- /**
- * Excel寫操作,簡單起見還是采用內存數據一次寫入
- * @param filePath 輸出文件路徑名
- * @param dataList 輸出文件內容,List<String>行 List列
- * @throws IOException
- */
- public void writeExcel(String filePath,List<List<String>> dataList) throws IOException{
- HSSFWorkbook wb = new HSSFWorkbook();
- HSSFSheet sheet = wb.createSheet("sheet");// 添加sheet
- // 表格樣式
- HSSFCellStyle style = wb.createCellStyle();
- style.setAlignment(HSSFCellStyle.ALIGN_CENTER);// 指定單元格居中對齊
- // // 邊框
- // style.setBorderBottom(HSSFCellStyle.BORDER_MEDIUM);
- // style.setBorderTop(HSSFCellStyle.BORDER_MEDIUM);
- // style.setBorderLeft(HSSFCellStyle.BORDER_MEDIUM);
- // style.setBorderRight(HSSFCellStyle.BORDER_MEDIUM);
- // //設置字體
- // HSSFFont f = wb.createFont();
- // f.setFontHeightInPoints((short)10);
- // f.setBoldweight(HSSFFont.BOLDWEIGHT_NORMAL);
- // style.setFont(f);
- // //設置列寬
- // sheet.setColumnWidth((short)0, (short)9600);
- // sheet.setColumnWidth((short)1, (short)4000);
- // sheet.setColumnWidth((short)2, (short)8000);
- // sheet.setColumnWidth((short)3, (short)8000);
- // 在索引0的位置創建第一行
- for (int i = 0; i < dataList.size(); i++) {
- HSSFRow row = sheet.createRow(i);
- List<String> list = dataList.get(i);
- for (int j = 0; j < list.size(); j++) {
- HSSFCell cell = row.createCell(j);
- cell.setCellValue(list.get(j));
- cell.setCellStyle(style);
- }
- }
- // 導出文件
- FileOutputStream fout = new FileOutputStream(filePath);
- wb.write(fout);
- fout.close();
- }
- }
DataClean類,包含對html標簽和信息中url的清洗。
- package dat.datadeal;
- import java.io.IOException;
- import java.util.ArrayList;
- import java.util.List;
- import java.util.regex.Matcher;
- import java.util.regex.Pattern;
- /**
- *
- * @author daT dev.tao@gmail.com
- *
- */
- public class DataClean {
- /**
- * 清洗html標簽
- * @param inputString
- * @return
- */
- public static String delHtml(String inputString) {
- String htmlStr = inputString; // 含html標簽的字符串
- String textStr = "";
- java.util.regex.Pattern p_script;
- java.util.regex.Matcher m_script;
- java.util.regex.Pattern p_html;
- java.util.regex.Matcher m_html;
- try {
- String regEx_html = "<[^>]+>"; // 定義HTML標簽的正則表達式
- String regEx_script = "<[/s]*?script[^>]*?>[/s/S]*?<[/s]*?//[/s]*?script[/s]*?>"; // 定義script的正則表達式{或<script[^>]*?>[/s/S]*?<//script>
- p_script = Pattern.compile(regEx_script, Pattern.CASE_INSENSITIVE);
- m_script = p_script.matcher(htmlStr);
- htmlStr = m_script.replaceAll(""); // 過濾script標簽
- p_html = Pattern.compile(regEx_html, Pattern.CASE_INSENSITIVE);
- m_html = p_html.matcher(htmlStr);
- htmlStr = m_html.replaceAll(""); // 過濾html標簽
- textStr = htmlStr;
- } catch (Exception e) {
- System.err.println("Html2Text: " + e.getMessage());
- }
- return textStr;// 返回文本字符串
- }
- /**
- * 處理掉信息中的url地址
- */
- public static String dealWithUrl(String str){
- String regEx = "[http|https]+[://]+[0-9A-Za-z:/[-]_#[?][=][.][&]]*";
- Pattern p = Pattern.compile(regEx);
- Matcher m = p.matcher(str);
- return m.replaceAll("");
- }
- public static void main(String[] args) throws IOException{
- ExcelUtil excelUtil = new ExcelUtil();
- List<List<String>> writeList = new ArrayList<List<String>>();
- List<String[]> readList =excelUtil.readExcel("/home/dat/javatest/微博數據_.xlsx");
- for(String[] lineArray:readList){
- List<String> strList = new ArrayList<String>();
- for(String str:lineArray){
- String strTmp = DataClean.dealWithUrl(DataClean.delHtml(str));
- strList.add(strTmp);
- //System.out.println(strTmp);
- }
- writeList.add(strList);
- }
- excelUtil.writeExcel("/home/dat/javatest/weibo.xlsx",writeList);
- System.out.println("job has finished...........");
- }
- }
清洗后數據: