From 03ca3aadc9b83ba76b8cd793dc76aa7ea1bc693e Mon Sep 17 00:00:00 2001 From: Alexei Bezborodov Date: Fri, 27 Oct 2023 22:45:22 +0300 Subject: [PATCH] =?UTF-8?q?=D0=94=D0=BE=D0=B1=D0=B0=D0=B2=D0=BB=D0=B5?= =?UTF-8?q?=D0=BD=D1=8B=20=D0=B7=D0=B0=D0=B3=D0=B0=D0=BB=D0=BE=D0=B2=D0=BA?= =?UTF-8?q?=D0=B8=20=D1=82=D0=B0=D1=81=D1=81,=20=D0=B8=D1=81=D0=BF=D1=80?= =?UTF-8?q?=D0=B0=D0=B2=D0=BB=D0=B5=D0=BD=D0=B0=20=D0=BE=D1=87=D0=B8=D1=81?= =?UTF-8?q?=D1=82=D0=BA=D0=B0=20=D0=A5=D0=A2=D0=9C=D0=9B,=20=D0=BE=D1=81?= =?UTF-8?q?=D1=82=D0=B0=D1=8E=D1=82=D1=81=D1=8F=20=D1=82=D0=B5=D0=B3=D0=B8?= =?UTF-8?q?=20=D1=82=D0=B0=D0=B1=D0=BB=D0=B8=D1=86=20=D0=B8=20=D1=81=D0=BF?= =?UTF-8?q?=D0=B8=D1=81=D0=BA=D0=BE=D0=B2=20=D0=B8=20=D0=BC=D0=BD=D0=BE?= =?UTF-8?q?=D0=B3=D0=BE=20=D1=87=D0=B5=D0=B3=D0=BE=20=D0=B5=D1=89=D1=91.?= =?UTF-8?q?=20#5=20#1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- news_parser.js | 82 +++++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 55 insertions(+), 27 deletions(-) diff --git a/news_parser.js b/news_parser.js index 322aa9c..2d3bde3 100644 --- a/news_parser.js +++ b/news_parser.js @@ -113,26 +113,7 @@ for (var i in a_Elements) { let e = a_Elements[i]; - let content = ''; - if (e.querySelectorAll) { - var children = e.querySelectorAll("*"); - if (children.length == 0 || e.innerText) { - content += a_GrubTextFunc(e); - } - else { - for (let i = 0; i < children.length; i++) { - let c = children[i]; - content += a_GrubTextFunc(c); - } - } - } - - if (a_FinishWorkFunc) { - result += a_FinishWorkFunc(content, e); - } - else { - result += content; - } + result += a_FinishWorkFunc(e); } return result; } @@ -183,8 +164,57 @@ return GrubTextFunc } + function RemoveAllAttributes(a_Element) { + let new_el = document.createElement(a_Element.nodeName); + new_el.innerHTML = a_Element.innerHTML; + a_Element.outherHTML = new_el.outherHTML; + } + + function RemoveCurrentElementSaveChild(a_Element, a_Parent) { + var 
parent = a_Element.parentNode || a_Parent; + while(a_Element.firstChild) parent.insertBefore(a_Element.firstChild, a_Element); + parent.removeChild(a_Element); + } + + function GetClearHtml(a_Element, a_OutTag, a_TextAlign) { + let clear_element = a_Element.cloneNode(true); + let tags_to_delete = ['div', 'span', 'em', 'svg', 'path']; + + tags_to_delete.forEach(function (del_tag_name) { + let elements = clear_element.querySelectorAll(del_tag_name); + elements.forEach(function (element) { + RemoveCurrentElementSaveChild(element, clear_element); + }); + }); + + let elements = clear_element.querySelectorAll('*'); + elements.forEach(function (element) { + RemoveAllAttributes(element); + }); + + elements = clear_element.querySelectorAll('a'); + let re = new RegExp("(https?:\/\/.*)"); + elements.forEach(function (element) { + if (!re.test(element.pathname)) { + RemoveCurrentElementSaveChild(element, clear_element); + } + }); + + let result = clear_element.innerHTML; + if (a_OutTag && a_TextAlign) { + result = '<' + a_OutTag + ' style = "text-align:' + a_TextAlign + ';">' + result + '</' + a_OutTag + '>'; + } + return result; + } + + function FinishWorkFuncTemplate(a_OutTag, a_TextAlign, a_ClearTextFunc) { - function FinishWorkFunc(a_Content, a_Element) { + function FinishWorkFunc(a_Element) { + let out_tag = a_OutTag; + if (a_Element && CheckRegExp(GetElementClassName, 'Title_title.*', a_Element)) { + out_tag = 'h2'; + } + let a_Content = GetClearHtml(a_Element, out_tag, a_TextAlign) if (a_ClearTextFunc) { a_Content = a_ClearTextFunc(a_Content); } @@ -204,12 +234,9 @@ let c = childrens[i]; content += FinishWorkFunc(GrubTextFuncTemplate()(c), c); } - return content + return content; } - if (a_OutTag && a_TextAlign) { - a_Content = '<' + a_OutTag + ' style = "text-align:' + a_TextAlign + ';">' + a_Content + '</' + a_OutTag + '>'; - } if (a_Element && a_Element.dataset && a_Element.dataset.type == 'quote') { a_Content = '
<blockquote>' + a_Content + '</blockquote>
'; } @@ -244,14 +271,15 @@ const grub_text_func = GrubTextFuncTemplate() if (location.hostname == 'tass.ru') { + // test: https://tass.ru/proisshestviya/19117971 const base_element = document.getElementById('content_box'); content = MakeContentByNews( base_element, base_element, base_element, - 'tass_pkg_title--variant_h1_default.*', + '(ArticleHeader_titles|tass_pkg_title--variant_h1_default).*', 'Image_wrapper_.*', - 'Paragraph_paragraph.*', + '(Paragraph_paragraph|Title_title).*', ElementCheckerTrue, ['/ТАСС/. '] );