Browse Source

Исправлена работа в РГ и фильтрация первого предложения в ТАСС #3

master
parent
commit
4358be65c7
  1. 41
      news_parser.js

41
news_parser.js

@ -75,8 +75,11 @@
return RemoveAfterSplitter(a_Url, separator, false);
}
function ClearTextFuncTemplate(a_RemoveBeforeList) {
function ClearTextFunc(a_Content) {
function ClearTextFuncTemplate(a_RemoveBeforeList, a_OnlyFirstIndex) {
function ClearTextFunc(a_Content, a_ElementIndex) {
if (a_OnlyFirstIndex && a_ElementIndex && a_ElementIndex != 1) {
return a_Content;
}
let content = a_Content;
for (let i = 0; i < a_RemoveBeforeList.length; i++) {
let r = a_RemoveBeforeList[i];
@ -111,10 +114,12 @@
function GetContentInContainers(a_Elements, a_GrubTextFunc, a_FinishWorkFunc) {
let result = '';
let element_index = 1;
for (var i in a_Elements) {
let e = a_Elements[i];
result += a_FinishWorkFunc(e);
result += a_FinishWorkFunc(e, element_index);
element_index += 1;
}
return result;
}
@ -182,11 +187,17 @@
let elements = clear_element.querySelectorAll('*');
elements.forEach(function (element) {
RemoveAllAttributes(element);
element.removeAttribute('class');
if (element && element.parentNode && element.nodeName == 'DIV' && CheckRegExp(GetElementClassName, '.*(read-more|article__cover).*', element)) {
let for_rt_com = 'read-more|article__cover';
let for_rg_com = 'portal|rg-incut|article-img|Section';
let delete_in_rg = element.nodeName == 'RG-VIDEO' || element.nodeName == 'RG-INCUT';
if (element && (element.parentNode && CheckRegExp(GetElementClassName, '.*(' + for_rt_com + '|' + for_rg_com + ').*', element) || delete_in_rg)) {
element.parentNode.removeChild(element);
}
else {
RemoveAllAttributes(element);
element.removeAttribute('class');
element.removeAttribute('id');
}
});
let tags_to_delete = ['div', 'span', 'em', 'svg', 'path'];
@ -226,23 +237,14 @@
function FinishWorkFuncTemplate(a_OutTag, a_TextAlign, a_ClearTextFunc) {
function FinishWorkFunc(a_Element) {
function FinishWorkFunc(a_Element, a_ElementIndex) {
let out_tag = a_OutTag;
if (a_Element && CheckRegExp(GetElementClassName, 'Title_title.*', a_Element)) {
out_tag = 'h2';
}
let a_Content = GetClearHtml(a_Element, out_tag, a_TextAlign)
if (a_ClearTextFunc) {
a_Content = a_ClearTextFunc(a_Content);
}
if (a_Element && CheckRegExp(GetElementClassName, '(PageContentCommonStyling_text.*)', a_Element)) {
let content = '';
let childrens = FindElementsByRegExp(GetNodeName, '(P)', a_Element);
for (let i = 0; i < childrens.length; i++) {
let c = childrens[i];
content += FinishWorkFunc(c);
}
return content;
a_Content = a_ClearTextFunc(a_Content, a_ElementIndex);
}
if (a_Element && a_Element.dataset && a_Element.dataset.type == 'quote') {
@ -263,7 +265,7 @@
const grub_func = GrubTextFuncTemplate();
let content = '';
const paragraph_finish_text_func = FinishWorkFuncTemplate(p_tag, 'justify', ClearTextFuncTemplate(a_ClearTextPatterns));
const paragraph_finish_text_func = FinishWorkFuncTemplate(p_tag, 'justify', ClearTextFuncTemplate(a_ClearTextPatterns, true));
content += GetContentInContainers(FindElementsByRegExp(GetElementClassName, a_TitleRegExpElementPattern, a_BaseElementTitle), grub_func, title_finish_text_func);
content += GetImageInContainers(FindElementsByRegExp(GetElementClassName, a_ImageRegExpElementPattern, a_BaseElementImage), 'center');
content += GetContentInContainers(FIlterElements(FindElementsByRegExp(GetElementClassName, a_TextRegExpElementPattern, a_BaseElementText), a_ElementChecker), grub_func, paragraph_finish_text_func);
@ -289,7 +291,7 @@
'Image_wrapper_.*',
'(Paragraph_paragraph|Title_title).*',
ElementCheckerTrue,
['/ТАСС/. ']
['. ']
);
}
else if (location.hostname == 'ria.ru') {
@ -315,6 +317,7 @@
);
}
else if (location.hostname == 'rg.ru') {
// test: https://rg.ru/2023/10/28/volontery-iz-evrosoiuza-privezli-dlia-zhitelej-donbassa-20-tonn-gumanitarnogo-gruza.html
const base_element = document.getElementsByClassName('article__header')[0];
const base_element_text = document.getElementsByClassName('article__body')[0];

Loading…
Cancel
Save