diff --git a/news_parser.js b/news_parser.js
index 322aa9c..2d3bde3 100644
--- a/news_parser.js
+++ b/news_parser.js
@@ -113,26 +113,10 @@
 
         for (var i in a_Elements) {
             let e = a_Elements[i];
-            let content = '';
-            if (e.querySelectorAll) {
-                var children = e.querySelectorAll("*");
-                if (children.length == 0 || e.innerText) {
-                    content += a_GrubTextFunc(e);
-                }
-                else {
-                    for (let i = 0; i < children.length; i++) {
-                        let c = children[i];
-                        content += a_GrubTextFunc(c);
-                    }
-                }
-            }
-
-            if (a_FinishWorkFunc) {
-                result += a_FinishWorkFunc(content, e);
-            }
-            else {
-                result += content;
-            }
+            // for...in may yield non-element values; the removed code guarded on querySelectorAll as well.
+            if (e && e.querySelectorAll) {
+                result += a_FinishWorkFunc(e);
+            }
         }
         return result;
     }
@@ -183,8 +167,77 @@
         return GrubTextFunc
     }
 
+    /**
+     * Strips every attribute from a_Element in place.
+     * @param {Element} a_Element - element whose attributes are removed
+     */
+    function RemoveAllAttributes(a_Element) {
+        while (a_Element.attributes.length > 0) {
+            a_Element.removeAttribute(a_Element.attributes[0].name);
+        }
+    }
+
+    /**
+     * Removes a_Element from its parent while keeping its children in its place.
+     * @param {Node} a_Element - node to unwrap
+     * @param {Node} a_Parent - parent to use when a_Element.parentNode is not set
+     */
+    function RemoveCurrentElementSaveChild(a_Element, a_Parent) {
+        const parent = a_Element.parentNode || a_Parent;
+        while (a_Element.firstChild) {
+            parent.insertBefore(a_Element.firstChild, a_Element);
+        }
+        parent.removeChild(a_Element);
+    }
+
+    /**
+     * Builds simplified HTML from a deep clone of a_Element; a_Element itself is not modified.
+     * @param {Element} a_Element - source element
+     * @param {string} a_OutTag - tag name used to wrap the result
+     * @param {string} a_TextAlign - text-align value written into the wrapper's style
+     * @returns {string} cleaned inner HTML, wrapped only when both a_OutTag and a_TextAlign are set
+     */
+    function GetClearHtml(a_Element, a_OutTag, a_TextAlign) {
+        const clear_element = a_Element.cloneNode(true);
+        const tags_to_delete = ['div', 'span', 'em', 'svg', 'path'];
+
+        tags_to_delete.forEach(function (del_tag_name) {
+            let elements = clear_element.querySelectorAll(del_tag_name);
+            elements.forEach(function (element) {
+                RemoveCurrentElementSaveChild(element, clear_element);
+            });
+        });
+
+        let elements = clear_element.querySelectorAll('*');
+        elements.forEach(function (element) {
+            RemoveAllAttributes(element);
+        });
+
+        // NOTE(review): pathname holds only the path part of a link and attributes were stripped above,
+        // so this test normally fails and every <a> is unwrapped - confirm whether href was intended.
+        elements = clear_element.querySelectorAll('a');
+        const re = new RegExp("(https?:\/\/.*)");
+        elements.forEach(function (element) {
+            if (!re.test(element.pathname)) {
+                RemoveCurrentElementSaveChild(element, clear_element);
+            }
+        });
+
+        let result = clear_element.innerHTML;
+        if (a_OutTag && a_TextAlign) {
+            result = '<' + a_OutTag + ' style = "text-align:' + a_TextAlign + ';">' + result + '</' + a_OutTag + '>';
+        }
+        return result;
+    }
+
+
     function FinishWorkFuncTemplate(a_OutTag, a_TextAlign, a_ClearTextFunc) {
-        function FinishWorkFunc(a_Content, a_Element) {
+        function FinishWorkFunc(a_Element) {
+            let out_tag = a_OutTag;
+            if (a_Element && CheckRegExp(GetElementClassName, 'Title_title.*', a_Element)) {
+                out_tag = 'h2';
+            }
+            let a_Content = GetClearHtml(a_Element, out_tag, a_TextAlign);
             if (a_ClearTextFunc) {
                 a_Content = a_ClearTextFunc(a_Content);
             }
@@ -204,12 +257,9 @@
                     let c = childrens[i];
-                    content += FinishWorkFunc(GrubTextFuncTemplate()(c), c);
+                    content += FinishWorkFunc(c);
                 }
-                return content
+                return content;
             }
 
-            if (a_OutTag && a_TextAlign) {
-                a_Content = '<' + a_OutTag + ' style = "text-align:' + a_TextAlign + ';">' + a_Content + '</' + a_OutTag + '>';
-            }
             if (a_Element && a_Element.dataset && a_Element.dataset.type == 'quote') {
                 a_Content = '<blockquote>' + a_Content + '</blockquote>';
             }
@@ -244,14 +294,15 @@
     const grub_text_func = GrubTextFuncTemplate()
 
     if (location.hostname == 'tass.ru') {
+        // Sample page for manual testing: https://tass.ru/proisshestviya/19117971
         const base_element = document.getElementById('content_box');
         content = MakeContentByNews(
             base_element,
             base_element,
             base_element,
-            'tass_pkg_title--variant_h1_default.*',
+            '(ArticleHeader_titles|tass_pkg_title--variant_h1_default).*',
             'Image_wrapper_.*',
-            'Paragraph_paragraph.*',
+            '(Paragraph_paragraph|Title_title).*',
             ElementCheckerTrue,
             ['/ТАСС/. ']
         );