众所周知,让用户在富文本编辑器中自由输入绝对不是一个明智的选择,但是有的时候又没有办法,所以只有一条原则来保证系统的安全性,那就是我们让用户输入什么,用户才能输入什么,而不是用户想输入什么,他就能输入什么,这样才能让系统处于我们的掌控之中,不至于捅出各种娄子,比如各种XSS注入什么的。
后来我们发现有一个比较好用的东西就是jsoup,它能够对输入的html进行过滤,简单来说就是可以设置白名单和黑名单(基于正则表达式)。白名单就是只允许一个html标签上有固定的属性,比如我们只允许<div height="100">,即div上只允许有height属性,其他属性我们都认为是非法的,就可以用jsoup设置白名单进行过滤。我们也可以设置黑名单,即我们觉得<div>标签什么属性都可以有,但是style属性我们不能控制,就把它列入黑名单,这也可以用jsoup实现。
jsoup实现的另一种方式:http://elf8848.iteye.com/blog/1872433
下面贴出一个样例:
import java.io.File;import java.io.FileInputStream;import java.io.IOException;import java.io.InputStream;import java.io.StringWriter;import java.io.Writer;import java.util.ArrayList;import java.util.HashMap;import java.util.List;import java.util.Map;import java.util.regex.Matcher;import java.util.regex.Pattern;import net.sf.json.JSONObject;import net.sf.json.JsonConfig;import org.apache.commons.io.IOUtils;import org.jsoup.Jsoup;import org.jsoup.nodes.Document;import org.jsoup.nodes.Document.OutputSettings;import org.jsoup.nodes.Element;import org.jsoup.safety.Whitelist;import org.jsoup.select.Elements;import org.springframework.core.io.ClassPathResource;import org.springframework.core.io.Resource;import org.springside.modules.mapper.JsonMapper;public class HTMLStringFilter { private final static String regxpForHtml = "<([^>]*)>"; // 过滤所有以 <开头以> 结尾的标签 private final static String PICTURE = "[图片]"; //private final static String regxpForImgTag = "<\\s*img\\s+([^>]*)\\s*>"; // 找出IMG标签 //private final static String regxpForImaTagSrcAttrib = "src=\"([^\"]+)\""; // 找出IMG标签的SRC属性 public HTMLStringFilter() { } public static String HTMLEncode(String fString){ fString=fString.replaceAll(" <", "<"); fString=fString.replaceAll(">", ">"); fString=fString.replaceAll(new String(new char[]{32}), " "); fString=fString.replaceAll(new String(new char[]{9}), " "); fString=fString.replaceAll(new String(new char[]{34}), """); fString=fString.replaceAll(new String(new char[]{39}), "'"); fString=fString.replaceAll(new String(new char[]{13}), ""); fString=fString.replaceAll(new String(new char[]{10,10}), ""); fString=fString.replaceAll(new String(new char[]{10}), "
"); return fString; } /** * xss escape */ public static String xssEscape(String input) { return input == null ? null : input.replaceAll("<", "<") .replaceAll(">", ">")// .replaceAll("eval\\((.*)\\)", "")// .replaceAll("[\"'][\\s]*((?i)javascript):(.*)[\"']", "\"\"")// .replaceAll("((?i)script)", "") ; } /** * 除指定标签之外的html标签编码 * @param str * @param tag * @return */ public static String xssEscapeExceptTag(String str,String tag) { String replaceTag="@"+tag+"@"; str=str.replaceAll("<"+tag,replaceTag ); str=xssEscape(str); str=str.replaceAll(replaceTag, "<"+tag); return str; } public static void main(String[] args){// System.out.println(new java.util.Date().getTime());// System.out.println(HTMLStringFilter.filterSafe("< script >ddd"));// System.out.println(HTMLStringFilter.filterSafe("< div >ddd"));// System.out.println("======"+HTMLStringFilter.filterSafe("< div oncliCk=''>ddd"));// // String imgHTML="";// String tag="img";// System.out.println("filter except:"+filterHtmlExceptTag(imgHTML, tag));// // System.out.println(new java.util.Date().getTime());// // String source="aaaaabbbbccc";// String title=replaceTag(source, "img", "alt");// System.out.println("title=="+title);// // String s="";// Listsrcs=match(source, "img", "src");// if (CollectionUtils.isNotEmpty(srcs)) {// for (String att : srcs) {// System.out.println("attr=="+att);// }// }// // System.out.println("html标签替换=="+replaceHtmlTagOfText(s, "img", "[图片]"));// String htmlStr="bb bb aaaa"; List srcs=getImgHTML(htmlStr); for (String src : srcs) { System.out.println("======="+src); }// System.out.println("=HTMLEncode=="+); // List htmls=getImgHTML(htmlStr);// List srcs=getImgSrc(htmlStr);// // System.out.println("--"+htmls.size()+"=="+srcs.size());// // for (String s : htmls) {// System.out.println("----"+s);// System.out.print(htmlStr.replaceFirst(s, "[图一]"));// }// for (String s : srcs) {// System.out.println("==="+s);// } } /** * 过滤一下字符串,连同前后< xxx >yyy< / xxx >全部消除。 * 不区分大小写、空格可识别 * "function", "window\\.", 
"javascript:", "script", * "js:", "about:", "file:", "document\\.", "vbs:", "frame", * "cookie", "onclick", "onfinish", "onmouse", "onexit=", * "onerror", "onclick", "onkey", "onload", "onfocus", "onblur" * @param htmlStr * @return */ public static String filterSafe(String htmlStr){ Pattern p = null; // 正则表达式 Matcher m = null; // 操作的字符串 StringBuffer tmp = null; String str = ""; boolean isHave = false; String[] Rstr = { "meta", "script", "object", "embed" }; if (htmlStr == null || !(htmlStr.length() > 0)) { return ""; } str = htmlStr.toLowerCase(); for (int i = 0; i < Rstr.length; i++) { p = Pattern.compile("<" + Rstr[i] + "(.[^>])*>"); m = p.matcher(str); tmp = new StringBuffer(); if (m.find()) { m.appendReplacement(tmp, "<" + Rstr[i] + ">"); while (m.find()) { m.appendReplacement(tmp, "<" + Rstr[i] + ">"); } isHave = true; } m.appendTail(tmp); str = tmp.toString(); p = Pattern.compile(" ])*>"); m = p.matcher(str); tmp = new StringBuffer(); if (m.find()) { m.appendReplacement(tmp, " "); while (m.find()) { m.appendReplacement(tmp, " "); } isHave = true; } m.appendTail(tmp); str = tmp.toString(); } // System.out.println(str); String[] Rstr1 = { "function", "window\\.", "javascript:", "script", "js:", "about:", "file:", "document\\.", "vbs:", "frame", "cookie", "onclick", "onfinish", "onmouse", "onexit=", "onerror", "onclick", "onkey", "onload", "onfocus", "onblur" }; for (int i = 0; i < Rstr1.length; i++) { p = Pattern.compile("<([^<>])*" + Rstr1[i] + "([^<>])*>([^<>])* ])*>"); m = p.matcher(str); tmp = new StringBuffer(); if (m.find()) { m.appendReplacement(tmp, ""); while (m.find()) { m.appendReplacement(tmp, ""); } isHave = true; } m.appendTail(tmp); str = tmp.toString(); } if (isHave) { htmlStr = str; } htmlStr = htmlStr.replaceAll("%3C", "<"); htmlStr = htmlStr.replaceAll("%3E", ">"); htmlStr = htmlStr.replaceAll("%2F", ""); htmlStr = htmlStr.replaceAll("&#", " &#"); return htmlStr; } /** * 采用jsoup白名单方式过滤非法的html字符。 * 原理: * 1.首先通过白名单过滤掉非法的html标签,即只允许输出白名单内的标签 * 
2.对特殊的属性(主要是style)用正则过滤,只允许安全的属性值存在 * @param htmlStr 原始的html片段(用户通过富文本编辑器提交的html代码) * @return 过滤后的安全的html片段 */ public static String cleanSafeHtml(String htmlStr) { Document doc = Jsoup.parseBodyFragment(htmlStr); OutputSettings outSet = new OutputSettings(); outSet.prettyPrint(false); outSet.outline(false); doc.outputSettings(outSet); Map regexMap = initRegexMap(); if (regexMap != null) { for (Map.Entry entiy:regexMap.entrySet()){ String key = entiy.getKey(); Elements els = doc.select(key); for (Element el:els) { System.out.println("old el:"+el.toString()); String attribute = key.substring(key.indexOf("[")+1, key.indexOf("]")); String attributeValue = el.attr(attribute); Matcher valueMatcher = Pattern.compile(entiy.getValue()).matcher(attributeValue); if (valueMatcher.find()) { String safeValue = valueMatcher.group(); System.out.println("safeValue:"+safeValue); el.attr(attribute, safeValue); } System.out.println("new el:"+el.toString()); } } } Whitelist whitelist = initWhiteList(); String safeString = Jsoup.clean(doc.html(), "", whitelist); System.out.println("safestring:"+safeString); return safeString; // Elements els = doc.select("[style]");// for (Element el:els) {// System.out.println("old el:"+el.toString());// String styleattribute = el.attr("style");// Matcher styleMatcher = Pattern.compile(styleAttributeRegex).matcher(styleattribute);// if (styleMatcher.find()) {// String safeStyle = styleMatcher.group();// System.out.println("safeStyle:"+safeStyle);// el.attr("style", safeStyle);// }// System.out.println("new el:"+el.toString());// } // Whitelist whitelist = Whitelist.relaxed();// whitelist.addAttributes("span", "style");// String safeString = Jsoup.clean(doc.html(), "", whitelist);// System.out.println("safestring:"+safeString);// return safeString; } private static Whitelist whitelist = null; private static Whitelist initWhiteList() { if (whitelist == null) { synchronized(new Object()) { whitelist = new Whitelist(); String jsonString = null; Resource 
resource = new ClassPathResource("/data/whitelist.conf"); File file = null; InputStream input = null; Writer output = null; try { file = resource.getFile(); input = new FileInputStream(file); output = new StringWriter(); IOUtils.copy(input, output); jsonString = output.toString(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); }finally { if (input != null) { IOUtils.closeQuietly(input); } if (output != null) { IOUtils.closeQuietly(output); } } JsonConfig config = new JsonConfig(); config.setIgnoreDefaultExcludes(true);//这里不设置,会把class属性过滤掉 JSONObject jsonObject = JSONObject.fromObject(jsonString,config); JSONObject whitelistjson = jsonObject.getJSONObject("whiteList"); JSONObject protocolsjson = jsonObject.getJSONObject("protocols"); JsonMapper newMapper = new JsonMapper(); Map > whitelistmap = newMapper.fromJson(whitelistjson.toString(), HashMap.class); Map > protocolsmap = newMapper.fromJson(protocolsjson.toString(), HashMap.class); for (Map.Entry > entiy:whitelistmap.entrySet()){ String tag = entiy.getKey(); whitelist.addTags(tag); for (Map.Entry entiy2:entiy.getValue().entrySet()){ String attribute = entiy2.getKey(); whitelist.addAttributes(tag, attribute); System.out.println("value value:"+entiy2.getValue()); } } for (Map.Entry > entiy:protocolsmap.entrySet()){ String tag = entiy.getKey().substring(0, entiy.getKey().indexOf(".")); String key = entiy.getKey().substring(entiy.getKey().indexOf(".")+1, entiy.getKey().length()); for (String entiy2:entiy.getValue()){ whitelist.addProtocols(tag, key, entiy2); } } } } return whitelist; } private static Map regexMap = null; private static Map initRegexMap() { if (regexMap == null) { synchronized (new Object()) { regexMap = new HashMap (); String jsonString = null; Resource resource = new ClassPathResource("/data/whitelist.conf"); File file = null; InputStream input = null; Writer output = null; try { file = resource.getFile(); input = new FileInputStream(file); output = new 
StringWriter(); IOUtils.copy(input, output); jsonString = output.toString(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); }finally { if (input != null) { IOUtils.closeQuietly(input); } if (output != null) { IOUtils.closeQuietly(output); } } JSONObject jsonObject = JSONObject.fromObject(jsonString); JSONObject whitelistjson = jsonObject.getJSONObject("whiteList"); JsonMapper newMapper = new JsonMapper(); Map > whitelistmap = newMapper.fromJson(whitelistjson.toString(), HashMap.class); for (Map.Entry > entiy:whitelistmap.entrySet()){ String tag = entiy.getKey(); for (Map.Entry entiy2:entiy.getValue().entrySet()){ String attribute = entiy2.getKey(); String attributeValue = entiy2.getValue(); if (attributeValue != null && attributeValue.trim().length() > 0) { regexMap.put(tag+"["+ attribute +"]", attributeValue); } } } } } return regexMap; } public static String filter(String input) { if (!hasSpecialChars(input)) { return input; } StringBuffer filtered = new StringBuffer(input.length()); char c; for (int i = 0; i <= input.length() - 1; i++) { c = input.charAt(i); switch (c) { case '<': filtered.append("<"); break; case '>': filtered.append(">"); break; case '"': filtered.append("&uot;"); break; case '&': filtered.append("&"); break; default: filtered.append(c); } } return (filtered.toString()); } public static boolean hasSpecialChars(String input) { boolean flag = false; if ((input != null) && (input.length() > 0)) { char c; for (int i = 0; i <= input.length() - 1; i++) { c = input.charAt(i); switch (c) { case '>': flag = true; break; case '<': flag = true; break; case '"': flag = true; break; case '&': flag = true; break; } } } return flag; } /** * * 基本功能:过滤所有以"<"开头以">"结尾的标签 * * * @param str * @return String */ public static String filterHtml(String str) { Pattern pattern = Pattern.compile(regxpForHtml); Matcher matcher = pattern.matcher(str); StringBuffer sb = new StringBuffer(); boolean result1 = matcher.find(); while (result1) { 
matcher.appendReplacement(sb, ""); result1 = matcher.find(); } matcher.appendTail(sb); return sb.toString(); } /** * 过滤除指定tag之外的html标签 * @param str * @param tag * @return */ public static String filterHtmlExceptTag(String str,String tag) { String replaceTag="@"+tag+"@"; str=str.replaceAll("<"+tag,replaceTag ); str=filterHtml(str); str=str.replaceAll(replaceTag, "<"+tag); return str; } /** * * 基本功能:过滤指定标签 *
* * @param str * @param tag * 指定标签 * @return String */ public static String fiterHtmlTag(String str, String tag) { String regxp = "<\\s*" + tag + "\\s+([^>]*)\\s*>"; Pattern pattern = Pattern.compile(regxp); Matcher matcher = pattern.matcher(str); StringBuffer sb = new StringBuffer(); boolean result1 = matcher.find(); while (result1) { matcher.appendReplacement(sb, ""); result1 = matcher.find(); } matcher.appendTail(sb); return sb.toString(); } /** * * 基本功能:替换指定的标签 *
* * @param str * @param beforeTag * 要替换的标签 * @param tagAttrib * 要替换的标签属性值 * @param startTag * 新标签开始标记 * @param endTag * 新标签结束标记 * @return String * @如:替换img标签的src属性值为[img]属性值[/img] */ public static String replaceHtmlTag(String str, String beforeTag, String tagAttrib, String startTag, String endTag) { String regxpForTag = "<\\s*" + beforeTag + "\\s+([^>]*)\\s*>"; String regxpForTagAttrib = tagAttrib + "=\"([^\"]+)\""; Pattern patternForTag = Pattern.compile(regxpForTag); Pattern patternForAttrib = Pattern.compile(regxpForTagAttrib); Matcher matcherForTag = patternForTag.matcher(str); StringBuffer sb = new StringBuffer(); boolean result = matcherForTag.find(); while (result) { StringBuffer sbreplace = new StringBuffer(); Matcher matcherForAttrib = patternForAttrib.matcher(matcherForTag .group(1)); if (matcherForAttrib.find()) { matcherForAttrib.appendReplacement(sbreplace, startTag + matcherForAttrib.group(1) + endTag); } matcherForTag.appendReplacement(sb, sbreplace.toString()); result = matcherForTag.find(); } matcherForTag.appendTail(sb); return sb.toString(); } /** * html标签替换为指定字符 * @param str * @param tagAttrib * @param beforeTag * @param replace * @return */ public static String replaceHtmlTagOfText(String str,String tag,String text) { String regxp = "<\\s*" + tag + "\\s+([^>]*)\\s*>"; Pattern pattern = Pattern.compile(regxp); Matcher matcher = pattern.matcher(str); StringBuffer sb = new StringBuffer(); boolean result1 = matcher.find(); while (result1) { matcher.appendReplacement(sb, text); result1 = matcher.find(); } matcher.appendTail(sb); return sb.toString(); } /** * 获取指定HTML标签的指定属性的值 * @param source 要匹配的源文本 * @param element 标签名称 * @param attr 标签的属性名称 * @return 属性值列表 */ public static List
match(String source, String element, String attr) { List result = new ArrayList (); String reg = "<" + element + "[^<>]*?\\s" + attr + "=['\"]?(.*?)['\"]?\\s.*?>"; Matcher m = Pattern.compile(reg).matcher(source); while (m.find()) { String r = m.group(1); result.add(r); } return result; } public static List getImgHTML(String html) { List resultList=new ArrayList (); Pattern p=Pattern.compile(" ]*)");// 结尾 Matcher m=p.matcher(html);//开始编译 while (m.find()) { resultList.add(" ");//获取匹配的部分 } return resultList; } public static List getImgSrc(String htmlStr){ String img=""; Pattern p_image; Matcher m_image; List pics = new ArrayList (); String regEx_img = " ]*?>"; //图片链接地址 p_image = Pattern.compile (regEx_img,Pattern.CASE_INSENSITIVE); m_image = p_image.matcher(htmlStr); while(m_image.find()){ img = m_image.group(); Matcher m = Pattern.compile("src=\"?(.*?)(\"|>|\\s+)").matcher(img); //匹配src while(m.find()){ pics.add(m.group(1)); } } return pics; } public static List getImgAlt(String htmlStr){ String img=""; Pattern p_image; Matcher m_image; List alts = new ArrayList (); String regEx_img = " ]*?>"; //图片链接地址 p_image = Pattern.compile (regEx_img,Pattern.CASE_INSENSITIVE); m_image = p_image.matcher(htmlStr); while(m_image.find()){ img = m_image.group(); Matcher m = Pattern.compile("alt=\"?(.*?)(\"|>|\\s+)").matcher(img); //匹配src while(m.find()){ alts.add(m.group(1)); } } return alts; } /** * * 基本功能:过滤所有以"<"开头以">"结尾的标签,但是替换为空格 * * * @param str * @return String */ public static String filterHtmlWithSapce(String str) { Pattern pattern = Pattern.compile(regxpForHtml); Matcher matcher = pattern.matcher(str); StringBuffer sb = new StringBuffer(); boolean result1 = matcher.find(); while (result1) { matcher.appendReplacement(sb, " "); result1 = matcher.find(); } matcher.appendTail(sb); return sb.toString(); } }
并且贴出一个jsoup的白名单配置文件(即上面代码从classpath加载的/data/whitelist.conf):
{"whiteList":{"a":{"href":"","title":""},"b":{},"blockquote":{"cite":""},"br":{},"caption":{},"cite":{},"code":{},"col":{"span":"","width":""},"colgroup":{"span":"","width":""},"dd":{},"div":{},"dl":{},"dt":{},"em":{},"h1":{},"h2":{},"h3":{},"h4":{},"h5":{},"h6":{},"i":{},"img":{"align":"", "alt":"", "height":"", "src":"", "title":"", "width":""},"li":{"class":"","style":"/^text-align:\\s*(left|right|center);?\\s*$/i"},"ol":{"start":"", "type":""},"p":{"style":"/^text-align:\\s*(left|right|center);?\\s*$/i"},"pre":{},"q":{"cite":""},"small":{},"span":{"style":"/^\\s*font-family\\s*:\\s*(('|\\\"|&quot;|&#39;)?(楷体|楷体_GB2312|宋体|微软雅黑|黑体|,|\\s|\\w|sans-serif)('|\\\"|&quot;|&#39;)?)+;?\\s*|\\s*(color|font-size|background-color)\\s*:\\s*(#\\w*|[\\w\\s]*|rgb\\s*\\(\\s*\\d+\\s*,\\s*\\d+\\s*,\\s*\\d+\\s*\\));?\\s*|\\s*text-decoration\\s*:\\s*(underline|overline|line-through|blink)\\s*;?\\s*$/i"},"strike":{},"strong":{},"sub":{},"sup":{},"table":{"summary":"", "width":""},"tbody":{},"td":{"abbr":"", "axis":"", "colspan":"", "rowspan":"", "width":""},"tfoot":{},"th":{"abbr":"", "axis":"", "colspan":"", "rowspan":"", "scope":"","width":""},"thead":{},"tr":{},"u":{},"ul":{"type":"","class":"","style":"/^list-style-type:\\s*(decimal|disc);\\s*$/i"}},"protocols":{"a.href":["ftp", "http", "https", "mailto"],"blockquote.cite":["http", "https"],"cite.cite":["http", "https"],"img.src":["http", "https"],"q.cite":["http", "https"]}}
即每个标签的任何属性,属性的值我们都可以进行过滤和定制。
这样,用户输入的任何东西都可以得到我们的控制。