C#:
/// <summary>
/// Reduces an HTML fragment to plain text: drops comments, script blocks and tags,
/// joins lines that are followed by whitespace, and decodes a small set of entities.
/// </summary>
/// <param name="html">HTML markup to clean. May be <c>null</c> or empty.</param>
/// <returns>
/// The text content of <paramref name="html"/>, or an empty string when the input is
/// <c>null</c> or empty.
/// </returns>
/// <remarks>
/// The result is plain text and may contain <c>&lt;</c>, <c>&gt;</c> or <c>&amp;</c> produced by
/// entity decoding; encode it again before embedding it in HTML.
/// </remarks>
public string RemoveHTML(string html)
{
    if (string.IsNullOrEmpty(html))
    {
        return string.Empty;
    }

    // Comments and script blocks are removed in a single pass so that whichever construct
    // opens first wins. Singleline lets '.' span line breaks (multi-line scripts/comments).
    html = Regex.Replace(
        html,
        @"<!--.*?-->|<script\b[^>]*>.*?</script\s*>",
        "",
        RegexOptions.IgnoreCase | RegexOptions.Singleline);

    // Every remaining tag.
    html = Regex.Replace(html, @"<.[^>]*>", "");

    // A line break followed by more whitespace is dropped entirely; this also removes
    // every "\r\n" pair, so no separate CRLF pass is needed.
    html = Regex.Replace(html, @"[\r\n]\s+", "");

    // Fragments of comments that were never properly opened or closed.
    html = html.Replace("-->", "");
    html = Regex.Replace(html, @"<!--.*", "");

    // Angle brackets left over from malformed markup. String.Replace returns a new string,
    // so the result must be assigned. This runs BEFORE entity decoding on purpose, so that
    // text written as &lt; / &gt; survives in the output.
    html = html.Replace("<", "").Replace(">", "");

    // Decode entities in one pass so that already-decoded text is never decoded a second
    // time (e.g. "&amp;lt;" becomes "&lt;", not "<").
    return Regex.Replace(
        html,
        @"&(quot|amp|lt|gt|nbsp|iexcl|cent|pound|copy|#\d+);",
        new MatchEvaluator(DecodeHtmlEntity),
        RegexOptions.IgnoreCase);
}

/// <summary>
/// Maps one matched entity reference to its replacement text. Numeric references that are
/// not in the supported set are removed (replaced by an empty string).
/// </summary>
/// <param name="entity">A match whose first group is the entity name or <c>#NNN</c> code.</param>
/// <returns>The replacement text for the entity.</returns>
private static string DecodeHtmlEntity(Match entity)
{
    switch (entity.Groups[1].Value.ToLowerInvariant())
    {
        case "quot":
        case "#34":
            return "\"";
        case "amp":
        case "#38":
            return "&";
        case "lt":
        case "#60":
            return "<";
        case "gt":
        case "#62":
            return ">";
        case "nbsp":
        case "#160":
            return " ";
        case "iexcl":
        case "#161":
            return "\u00A1";
        case "cent":
        case "#162":
            return "\u00A2";
        case "pound":
        case "#163":
            return "\u00A3";
        case "copy":
        case "#169":
            return "\u00A9";
        default:
            return string.Empty;
    }
}

/// <summary>
/// Collects the <c>src</c> value of every <c>img</c> tag found in an HTML string.
/// </summary>
/// <param name="sHtmlText">HTML markup to scan. May be <c>null</c> or empty.</param>
/// <returns>
/// The captured <c>src</c> values in document order; an empty array when the input is
/// <c>null</c>, empty, or contains no matching tag.
/// </returns>
public static string[] GetHtmlImageUrlList(string sHtmlText)
{
    if (string.IsNullOrEmpty(sHtmlText))
    {
        return new string[0];
    }

    // Regular expression that matches img tags and captures the src value as "imgUrl".
    Regex regImg = new Regex(@"<img\b[^<>]*?\bsrc[\s\t\r\n]*=[\s\t\r\n]*[""']?[\s\t\r\n]*(?<imgUrl>[^\s\t\r\n""'<>]*)[^<>]*?/?[\s\t\r\n]*>", RegexOptions.IgnoreCase);

    // Find every matching tag.
    MatchCollection matches = regImg.Matches(sHtmlText);

    // Copy the captured URLs into the result array.
    string[] sUrlList = new string[matches.Count];
    for (int i = 0; i < matches.Count; i++)
    {
        sUrlList[i] = matches[i].Groups["imgUrl"].Value;
    }

    return sUrlList;
}
JavaScript:
/**
 * Collects the src value of every <img> tag in an HTML string.
 *
 * Quoted values (single or double quotes) may contain spaces; unquoted values end at
 * the first whitespace, quote or '>'. Tags with an empty src are skipped.
 *
 * @param {string} htmlstr HTML markup to scan; null/undefined is treated as empty.
 * @returns {string[]} The src values in document order (empty array when none).
 */
function getimgsrc(htmlstr) {
    var text = htmlstr == null ? '' : String(htmlstr);
    // [^>]*? keeps the search inside the current tag, so a src attribute belonging to a
    // later element is never reported; \s before "src" avoids matching "data-src".
    var reg = /<img\b[^>]*?\ssrc\s*=\s*(?:"([^"]+)"|'([^']+)'|([^\s"'>]+))/gi;
    var arr = [];
    var tem; // declared locally; it must not leak as an implicit global
    while ((tem = reg.exec(text)) !== null) {
        // Exactly one of the three capture groups is set, depending on the quoting style.
        arr.push(tem[1] || tem[2] || tem[3]);
    }
    return arr;
}

/**
 * Strips HTML tags from a string, trims whitespace at line ends and removes
 * non-breaking spaces (both the "&nbsp;" entity and the U+00A0 character).
 *
 * @param {string} str HTML markup to clean; null/undefined yields an empty string.
 * @returns {string} The cleaned text.
 */
function removeHTMLTag(str) {
    if (str == null) {
        return '';
    }
    return String(str)
        .replace(/<\/?[^>]*>/g, '')            // remove HTML tags
        .replace(/[ \t\u00a0]+(?=\r?\n)/g, '') // remove whitespace at line ends
        .replace(/&nbsp;|\u00a0/gi, '');       // remove non-breaking spaces
}