From 4358be65c7655ea97ecfa16601c9b910c2f90d48 Mon Sep 17 00:00:00 2001 From: Alexei Bezborodov Date: Sat, 28 Oct 2023 14:44:29 +0300 Subject: [PATCH] =?UTF-8?q?=D0=98=D1=81=D0=BF=D1=80=D0=B0=D0=B2=D0=BB?= =?UTF-8?q?=D0=B5=D0=BD=D0=B0=20=D1=80=D0=B0=D0=B1=D0=BE=D1=82=D0=B0=20?= =?UTF-8?q?=D0=B2=20=D0=A0=D0=93=20=D0=B8=20=D1=84=D0=B8=D0=BB=D1=8C=D1=82?= =?UTF-8?q?=D1=80=D0=B0=D1=86=D0=B8=D1=8F=20=D0=BF=D0=B5=D1=80=D0=B2=D0=BE?= =?UTF-8?q?=D0=B3=D0=BE=20=D0=BF=D1=80=D0=B5=D0=B4=D0=BB=D0=BE=D0=B6=D0=B5?= =?UTF-8?q?=D0=BD=D0=B8=D1=8F=20=D0=B2=20=D0=A2=D0=90=D0=A1=D0=A1=20#3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- news_parser.js | 41 ++++++++++++++++++++++------------------- 1 file changed, 22 insertions(+), 19 deletions(-) diff --git a/news_parser.js b/news_parser.js index 6472152..11869a9 100644 --- a/news_parser.js +++ b/news_parser.js @@ -75,8 +75,11 @@ return RemoveAfterSplitter(a_Url, separator, false); } - function ClearTextFuncTemplate(a_RemoveBeforeList) { - function ClearTextFunc(a_Content) { + function ClearTextFuncTemplate(a_RemoveBeforeList, a_OnlyFirstIndex) { + function ClearTextFunc(a_Content, a_ElementIndex) { + if (a_OnlyFirstIndex && a_ElementIndex && a_ElementIndex != 1) { + return a_Content; + } let content = a_Content; for (let i = 0; i < a_RemoveBeforeList.length; i++) { let r = a_RemoveBeforeList[i]; @@ -111,10 +114,12 @@ function GetContentInContainers(a_Elements, a_GrubTextFunc, a_FinishWorkFunc) { let result = ''; + let element_index = 1; for (var i in a_Elements) { let e = a_Elements[i]; - result += a_FinishWorkFunc(e); + result += a_FinishWorkFunc(e, element_index); + element_index += 1; } return result; } @@ -182,11 +187,17 @@ let elements = clear_element.querySelectorAll('*'); elements.forEach(function (element) { - RemoveAllAttributes(element); - element.removeAttribute('class'); - if (element && element.parentNode && element.nodeName == 'DIV' && CheckRegExp(GetElementClassName, '.*(read-more|article__cover).*', element)) { + let for_rt_com = 'read-more|article__cover'; + let for_rg_com = 'portal|rg-incut|article-img|Section'; + let delete_in_rg = element.nodeName == 'RG-VIDEO' || element.nodeName == 'RG-INCUT'; + if (element && (element.parentNode && CheckRegExp(GetElementClassName, '.*(' + for_rt_com + '|' + for_rg_com + ').*', element) || delete_in_rg)) { element.parentNode.removeChild(element); } + else { + RemoveAllAttributes(element); + element.removeAttribute('class'); + element.removeAttribute('id'); + } }); let tags_to_delete = ['div', 'span', 'em', 'svg', 'path']; @@ -226,23 +237,14 @@ function FinishWorkFuncTemplate(a_OutTag, a_TextAlign, a_ClearTextFunc) { - function FinishWorkFunc(a_Element) { + function FinishWorkFunc(a_Element, a_ElementIndex) { let out_tag = a_OutTag; if (a_Element && CheckRegExp(GetElementClassName, 'Title_title.*', a_Element)) { out_tag = 'h2'; } let a_Content = GetClearHtml(a_Element, out_tag, a_TextAlign) if (a_ClearTextFunc) { - a_Content = a_ClearTextFunc(a_Content); - } - if (a_Element && CheckRegExp(GetElementClassName, '(PageContentCommonStyling_text.*)', a_Element)) { - let content = ''; - let childrens = FindElementsByRegExp(GetNodeName, '(P)', a_Element); - for (let i = 0; i < childrens.length; i++) { - let c = childrens[i]; - content += FinishWorkFunc(c); - } - return content; + a_Content = a_ClearTextFunc(a_Content, a_ElementIndex); } if (a_Element && a_Element.dataset && a_Element.dataset.type == 'quote') { @@ -263,7 +265,7 @@ const grub_func = GrubTextFuncTemplate(); let content = ''; - const paragraph_finish_text_func = FinishWorkFuncTemplate(p_tag, 'justify', ClearTextFuncTemplate(a_ClearTextPatterns)); + const paragraph_finish_text_func = FinishWorkFuncTemplate(p_tag, 'justify', ClearTextFuncTemplate(a_ClearTextPatterns, true)); content += GetContentInContainers(FindElementsByRegExp(GetElementClassName, a_TitleRegExpElementPattern, a_BaseElementTitle), grub_func, title_finish_text_func); content += GetImageInContainers(FindElementsByRegExp(GetElementClassName, a_ImageRegExpElementPattern, a_BaseElementImage), 'center'); content += GetContentInContainers(FIlterElements(FindElementsByRegExp(GetElementClassName, a_TextRegExpElementPattern, a_BaseElementText), a_ElementChecker), grub_func, paragraph_finish_text_func); @@ -289,7 +291,7 @@ 'Image_wrapper_.*', '(Paragraph_paragraph|Title_title).*', ElementCheckerTrue, - ['/ТАСС/. '] + ['. '] ); } else if (location.hostname == 'ria.ru') { @@ -315,6 +317,7 @@ ); } else if (location.hostname == 'rg.ru') { + // test: https://rg.ru/2023/10/28/volontery-iz-evrosoiuza-privezli-dlia-zhitelej-donbassa-20-tonn-gumanitarnogo-gruza.html const base_element = document.getElementsByClassName('article__header')[0]; const base_element_text = document.getElementsByClassName('article__body')[0];