java 去掉html標籤(Java中去掉網頁HTML標記的方法)--正則表達式


參考:

http://www.cnblogs.com/newsouls/p/3995394.html

http://blog.csdn.net/he20101020/article/details/21228311

 

內容:

package utils;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Regex-based helpers for stripping HTML markup from a string.
 *
 * <p>These helpers are a best-effort text extraction, not an HTML parser or a
 * sanitizer: do not rely on them to make untrusted markup safe to render.
 *
 * <p>Escape sequences referenced by the patterns in this class:
 * <ul>
 *   <li>{@code \n} - line feed (U+000A)</li>
 *   <li>{@code \r} - carriage return (U+000D)</li>
 *   <li>{@code \t} - horizontal tab (U+0009)</li>
 *   <li>{@code \s} - regex class matching any whitespace character
 *       (space U+0020, tab, line feed, vertical tab, form feed, carriage return)</li>
 * </ul>
 *
 * Created by Administrator on 2017/7/14.
 */
public class HtmlUtil {

    /** Matches a whole {@code <script>...</script>} element, including its body. */
    private static final Pattern SCRIPT_ELEMENT =
            Pattern.compile("<script[^>]*?>[\\s\\S]*?</script>", Pattern.CASE_INSENSITIVE);

    /** Matches a whole {@code <style>...</style>} element, including its body. */
    private static final Pattern STYLE_ELEMENT =
            Pattern.compile("<style[^>]*?>[\\s\\S]*?</style>", Pattern.CASE_INSENSITIVE);

    /** Matches an HTML comment, which may itself contain {@code >} characters. */
    private static final Pattern HTML_COMMENT = Pattern.compile("<!--[\\s\\S]*?-->");

    /** Matches any remaining tag with at least one character between the angle brackets. */
    private static final Pattern HTML_TAG = Pattern.compile("<[^>]+>");

    /** Matches any tag-like span, including the empty {@code <>}, across line breaks. */
    private static final Pattern ANY_ANGLE_SPAN = Pattern.compile("<[^>]*>");

    /** Matches one or more whitespace characters. */
    private static final Pattern WHITESPACE = Pattern.compile("\\s+");

    /** Matches an opening {@code <p>} tag, with or without attributes. */
    private static final Pattern PARAGRAPH_OPEN =
            Pattern.compile("<p(\\s[^>]*)?>", Pattern.CASE_INSENSITIVE);

    /** Matches {@code <br>}, {@code <br/>} and {@code <br />}. */
    private static final Pattern LINE_BREAK =
            Pattern.compile("<br\\s*/?>", Pattern.CASE_INSENSITIVE);

    /** Replacement text used for paragraph and line-break tags. */
    private static final String CRLF = "\r\n";

    /**
     * Demo entry point: strips the markup from a sample HTML fragment and prints the result.
     *
     * @param args unused
     */
    public static void main(String[] args) {

        String str = "<div class=\"WB_cardwrap S_bg2\">\n" +
                "  <div class=\"search_feed\">\n" +
                "    <div class=\"person_list_feed clearfix\">\n" +
                "      <div class=\"pl_personlist\">\n" +
                "        <div class=\"list_person clearfix\">\n" +
                "          <div class=\"person_pic\">\n" +
                "            <a target=\"_blank\" href=\"http://weibo.com/114dotcom?refer_flag=1001030201_\" title=\"114導航\" suda-data=\"key=tblog_search_user&value=user_feed_1_icon\">\n" +
                "              <img class=\"W_face_radius\" src=\"http://tva4.sinaimg.cn/crop.176.129.505.505.180/006aCV5fgw1f0gxs4ikuyj30ne0juq47.jpg\" uid=\"5653836249\" height=\"80\" width=\"80\" /></a>\n" +
                "          </div>\n" +
                "          <div class=\"person_detail\">\n" +
                "            <p class=\"person_name\">\n" +
                "              <a class=\"W_texta W_fb\" target=\"_blank\" href=\"http://weibo.com/114dotcom?refer_flag=1001030201_\" title=\"114導航\" uid=\"5653836249\" suda-data=\"key=tblog_search_user&value=user_feed_1_name\">114導航</a>\n" +
                "              <a target=\"_blank\" href=\"http://verified.weibo.com/verify\" title=\"微博機構認證\" alt=\"微博機構認證\" class=\"W_icon icon_approve_co\"></a>\n" +
                "            </p>\n" +
                "            <p class=\"person_addr\">\n" +
                "              <span class=\"male m_icon\" title=\"男\"></span>\n" +
                "              <span>廣東</span>\n" +
                "              <a class=\"W_linkb\" target=\"_blank\" href=\"http://weibo.com/114dotcom?refer_flag=1001030201_\" class=\"wb_url\" suda-data=\"key=tblog_search_user&value=user_feed_1_url\">http://weibo.com/114dotcom</a></p>\n" +
                "            <p class=\"person_card\">\n" +
                "              <em class=\"red\">一一四網絡有限公司</em></p>\n" +
                "            <p class=\"person_num\">\n" +
                "              <span>關注\n" +
                "                <a class=\"W_linkb\" href=\"http://weibo.com/5653836249/follow?refer_flag=1001030201_\" target=\"_blank\" suda-data=\"key=tblog_search_user&value=user_feed_1_num\">68</a></span>\n" +
                "              <span>粉絲\n" +
                "                <a class=\"W_linkb\" href=\"http://weibo.com/5653836249/fans?refer_flag=1001030201_\" target=\"_blank\" suda-data=\"key=tblog_search_user&value=user_feed_1_num\">118</a></span>\n" +
                "              <span>微博\n" +
                "                <a class=\"W_linkb\" href=\"http://weibo.com/5653836249/profile?refer_flag=1001030201_\" target=\"_blank\" suda-data=\"key=tblog_search_user&value=user_feed_1_num\">7</a></span>\n" +
                "            </p>\n" +
                "            <div class=\"person_info\">\n" +
                "              <p>簡介: 114.com,不一樣的導航,能記住嗎?追求小清新,簡約到極致。推薦給您的不僅是網址,更是給你需要的答案。此外,114.com還提供電話、品牌、名人、價格等各種實用查詢。</p>\n" +
                "            </div>\n" +
                "            <p class=\"person_label\">標簽:\n" +
                "              <a class=\"W_linkb\" href=\"&tag=%25E6%2596%25B0%25E9%2597%25BB%25E7%2583%25AD%25E7%2582%25B9&Refer=SUer_tag\" suda-data=\"key=tblog_search_user&value=user_feed_1_label\">新聞熱點</a></p>\n" +
                "          </div>\n" +
                "        </div>\n" +
                "      </div>\n" +
                "    </div>\n" +
                "  </div>\n" +
                "</div>\n" +
                "<div class=\"WB_cardwrap S_bg2 relative\"></div>\n" +
                "<!-- 未登錄提示 -->\n" +
                "<div class=\"search_tips clearfix\">\n" +
                "  <p class=\"tips_co\">\n" +
                "    <span class=\"tips_icon icon_warn\"></span>\n" +
                "    <span class=\"tips_txt\">\n" +
                "      <a href=\"javascript:void(0);\" action-type=\"login\">立即登錄</a>查看更多結果。還沒有賬號?趕緊\n" +
                "      <a href=\"http://weibo.com/signup/signup.php?lang=zh-cn&amp;entry=weisousuo\" suda-data=\"key=tblog_search_v4.1&amp;value=nologin_reg\" target=\"_blank\">注冊微博</a></span>\n" +
                "  </p>\n" +
                "</div>\n" +
                "<!-- /未登錄提示 -->";

        System.out.println(delHTMLTag(str));
    }

    /**
     * Removes script elements, style elements, HTML comments, all remaining tags
     * and <em>all</em> whitespace from the given markup.
     *
     * <p>Because every whitespace character is removed, words separated only by
     * spaces are joined together ({@code "a b"} becomes {@code "ab"}). HTML
     * entities such as {@code &amp;} are left untouched.
     *
     * @param htmlStr the markup to clean; may be {@code null}
     * @return the remaining text, or an empty string when {@code htmlStr} is {@code null}
     */
    public static String delHTMLTag(String htmlStr) {
        if (htmlStr == null) {
            return "";
        }
        // Script and style elements go first so that their bodies are dropped
        // together with the tags instead of being left behind as text.
        String text = SCRIPT_ELEMENT.matcher(htmlStr).replaceAll("");
        text = STYLE_ELEMENT.matcher(text).replaceAll("");
        // Comments are removed as a unit: the generic tag pattern stops at the
        // first '>' and would leak the tail of a comment that contains one.
        text = HTML_COMMENT.matcher(text).replaceAll("");
        text = HTML_TAG.matcher(text).replaceAll("");
        text = WHITESPACE.matcher(text).replaceAll("");
        return text.trim();
    }

    /**
     * Strips tags while keeping a rough line structure: every opening
     * {@code <p>} tag and every {@code <br>} variant becomes {@code "\r\n"},
     * and all other tags are removed. Tag matching is case-insensitive for
     * {@code <p>} and {@code <br>}. Text, whitespace and HTML entities are
     * left as they are.
     *
     * @param content the markup to clean; may be {@code null}
     * @return the text with line breaks, or an empty string when {@code content} is {@code null}
     */
    public static String stripHtml(String content) {
        if (content == null) {
            return "";
        }
        // Paragraph openings (with or without attributes) become line breaks.
        String text = PARAGRAPH_OPEN.matcher(content).replaceAll(CRLF);
        // <br>, <br/> and <br /> become line breaks.
        text = LINE_BREAK.matcher(text).replaceAll(CRLF);
        // Everything else between angle brackets is dropped.
        text = ANY_ANGLE_SPAN.matcher(text).replaceAll("");
        return text;
    }

    /**
     * Extracts plain text from markup: applies {@link #delHTMLTag(String)} and
     * then removes every literal {@code &nbsp;} entity. Other entities are kept.
     *
     * @param htmlStr the markup to clean; may be {@code null}
     * @return the extracted text, or an empty string when {@code htmlStr} is {@code null}
     */
    public static String getTextFromHtml(String htmlStr) {
        String text = delHTMLTag(htmlStr);
        // Literal replacement: no regex is needed for a fixed entity string.
        return text.replace("&nbsp;", "");
    }

}

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯繫本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM