Browse Source

Исправлена работа в РГ и фильтрация первого предложения в ТАСС #3

master
parent
commit
4358be65c7
  1. 41
      news_parser.js

41
news_parser.js

@ -75,8 +75,11 @@
return RemoveAfterSplitter(a_Url, separator, false); return RemoveAfterSplitter(a_Url, separator, false);
} }
function ClearTextFuncTemplate(a_RemoveBeforeList) { function ClearTextFuncTemplate(a_RemoveBeforeList, a_OnlyFirstIndex) {
function ClearTextFunc(a_Content) { function ClearTextFunc(a_Content, a_ElementIndex) {
if (a_OnlyFirstIndex && a_ElementIndex && a_ElementIndex != 1) {
return a_Content;
}
let content = a_Content; let content = a_Content;
for (let i = 0; i < a_RemoveBeforeList.length; i++) { for (let i = 0; i < a_RemoveBeforeList.length; i++) {
let r = a_RemoveBeforeList[i]; let r = a_RemoveBeforeList[i];
@ -111,10 +114,12 @@
/**
 * Concatenates the output of a_FinishWorkFunc over every element of a_Elements.
 *
 * @param {ArrayLike<*>|Iterable<*>|null|undefined} a_Elements - elements to
 *     process, in order; a nullish value produces an empty string.
 * @param {*} a_GrubTextFunc - not used by this function; kept in the signature
 *     so existing call sites keep working.
 * @param {function(*, number): string} a_FinishWorkFunc - called as
 *     (element, elementIndex) where elementIndex is 1-based; its return value
 *     is appended to the result.
 * @returns {string} the callback results joined in element order.
 */
function GetContentInContainers(a_Elements, a_GrubTextFunc, a_FinishWorkFunc) {
    // Array.from walks only the indexed entries. A for...in loop would also
    // visit every other enumerable key (extra or inherited properties, or
    // 'length'/'item' on DOM collections), feeding non-elements to the
    // callback and shifting the 1-based index.
    const elements = Array.from(a_Elements || []);
    let result = '';
    elements.forEach(function (element, position) {
        // The callback expects a 1-based element index.
        result += a_FinishWorkFunc(element, position + 1);
    });
    return result;
}
@ -182,11 +187,17 @@
let elements = clear_element.querySelectorAll('*'); let elements = clear_element.querySelectorAll('*');
elements.forEach(function (element) { elements.forEach(function (element) {
RemoveAllAttributes(element); let for_rt_com = 'read-more|article__cover';
element.removeAttribute('class'); let for_rg_com = 'portal|rg-incut|article-img|Section';
if (element && element.parentNode && element.nodeName == 'DIV' && CheckRegExp(GetElementClassName, '.*(read-more|article__cover).*', element)) { let delete_in_rg = element.nodeName == 'RG-VIDEO' || element.nodeName == 'RG-INCUT';
if (element && (element.parentNode && CheckRegExp(GetElementClassName, '.*(' + for_rt_com + '|' + for_rg_com + ').*', element) || delete_in_rg)) {
element.parentNode.removeChild(element); element.parentNode.removeChild(element);
} }
else {
RemoveAllAttributes(element);
element.removeAttribute('class');
element.removeAttribute('id');
}
}); });
let tags_to_delete = ['div', 'span', 'em', 'svg', 'path']; let tags_to_delete = ['div', 'span', 'em', 'svg', 'path'];
@ -226,23 +237,14 @@
function FinishWorkFuncTemplate(a_OutTag, a_TextAlign, a_ClearTextFunc) { function FinishWorkFuncTemplate(a_OutTag, a_TextAlign, a_ClearTextFunc) {
function FinishWorkFunc(a_Element) { function FinishWorkFunc(a_Element, a_ElementIndex) {
let out_tag = a_OutTag; let out_tag = a_OutTag;
if (a_Element && CheckRegExp(GetElementClassName, 'Title_title.*', a_Element)) { if (a_Element && CheckRegExp(GetElementClassName, 'Title_title.*', a_Element)) {
out_tag = 'h2'; out_tag = 'h2';
} }
let a_Content = GetClearHtml(a_Element, out_tag, a_TextAlign) let a_Content = GetClearHtml(a_Element, out_tag, a_TextAlign)
if (a_ClearTextFunc) { if (a_ClearTextFunc) {
a_Content = a_ClearTextFunc(a_Content); a_Content = a_ClearTextFunc(a_Content, a_ElementIndex);
}
if (a_Element && CheckRegExp(GetElementClassName, '(PageContentCommonStyling_text.*)', a_Element)) {
let content = '';
let childrens = FindElementsByRegExp(GetNodeName, '(P)', a_Element);
for (let i = 0; i < childrens.length; i++) {
let c = childrens[i];
content += FinishWorkFunc(c);
}
return content;
} }
if (a_Element && a_Element.dataset && a_Element.dataset.type == 'quote') { if (a_Element && a_Element.dataset && a_Element.dataset.type == 'quote') {
@ -263,7 +265,7 @@
const grub_func = GrubTextFuncTemplate(); const grub_func = GrubTextFuncTemplate();
let content = ''; let content = '';
const paragraph_finish_text_func = FinishWorkFuncTemplate(p_tag, 'justify', ClearTextFuncTemplate(a_ClearTextPatterns)); const paragraph_finish_text_func = FinishWorkFuncTemplate(p_tag, 'justify', ClearTextFuncTemplate(a_ClearTextPatterns, true));
content += GetContentInContainers(FindElementsByRegExp(GetElementClassName, a_TitleRegExpElementPattern, a_BaseElementTitle), grub_func, title_finish_text_func); content += GetContentInContainers(FindElementsByRegExp(GetElementClassName, a_TitleRegExpElementPattern, a_BaseElementTitle), grub_func, title_finish_text_func);
content += GetImageInContainers(FindElementsByRegExp(GetElementClassName, a_ImageRegExpElementPattern, a_BaseElementImage), 'center'); content += GetImageInContainers(FindElementsByRegExp(GetElementClassName, a_ImageRegExpElementPattern, a_BaseElementImage), 'center');
content += GetContentInContainers(FIlterElements(FindElementsByRegExp(GetElementClassName, a_TextRegExpElementPattern, a_BaseElementText), a_ElementChecker), grub_func, paragraph_finish_text_func); content += GetContentInContainers(FIlterElements(FindElementsByRegExp(GetElementClassName, a_TextRegExpElementPattern, a_BaseElementText), a_ElementChecker), grub_func, paragraph_finish_text_func);
@ -289,7 +291,7 @@
'Image_wrapper_.*', 'Image_wrapper_.*',
'(Paragraph_paragraph|Title_title).*', '(Paragraph_paragraph|Title_title).*',
ElementCheckerTrue, ElementCheckerTrue,
['/ТАСС/. '] ['. ']
); );
} }
else if (location.hostname == 'ria.ru') { else if (location.hostname == 'ria.ru') {
@ -315,6 +317,7 @@
); );
} }
else if (location.hostname == 'rg.ru') { else if (location.hostname == 'rg.ru') {
// test: https://rg.ru/2023/10/28/volontery-iz-evrosoiuza-privezli-dlia-zhitelej-donbassa-20-tonn-gumanitarnogo-gruza.html
const base_element = document.getElementsByClassName('article__header')[0]; const base_element = document.getElementsByClassName('article__header')[0];
const base_element_text = document.getElementsByClassName('article__body')[0]; const base_element_text = document.getElementsByClassName('article__body')[0];

Loading…
Cancel
Save