// ==UserScript==
// @name News parser
// @namespace http://zakonvremeni.ru
// @version 0.2
// @description Parse news
// @author AlexeiBv+mirocod@narod.ru
// @match https://tass.ru/*
// @match https://ria.ru/*
// @match https://zakonvremeni.ru/*
// @icon https://icons.duckduckgo.com/ip2/zakonvremeni.ru.ico
// @grant none
// ==/UserScript==
// Общественное достояние, 2023, Алексей Безбородов (Alexei Bezborodov) <AlexeiBv+mirocod_news_parser@narod.ru>
( function ( ) {
'use strict' ;
// Поиск элементов по регулярному выражению
/**
 * Returns the class list of an element as a plain string.
 *
 * @param {Element} a_Element - Element whose classes are read.
 * @returns {string} The class string, or '' when the element has none.
 */
function GetElementClassName(a_Element) {
    const class_name = a_Element.className;
    if (typeof class_name === 'string') {
        return class_name;
    }
    // On SVG elements className is an SVGAnimatedString object; the text is in baseVal.
    if (class_name && typeof class_name.baseVal === 'string') {
        return class_name.baseVal;
    }
    return '';
}
/**
 * Collects the descendants of a parent whose generated name matches a pattern.
 *
 * The pattern is embedded into `(?:^|\s)PATTERN(?!\S)`, so a match has to start
 * at the beginning of the name or right after whitespace, and has to end at
 * whitespace or at the end of the name.
 *
 * @param {function(Element): string} a_GenElementNameFunc - Produces the name to test for an element.
 * @param {string} a_RegExpPattern - Regular-expression source; it is inserted as is, not escaped.
 * @param {Element|Document} [a_ElementParent] - Search root; falls back to `document` when falsy.
 * @returns {Element[]} Matching descendants in the order returned by getElementsByTagName('*').
 */
function FindElementsByRegExp(a_GenElementNameFunc, a_RegExpPattern, a_ElementParent) {
    const parent = a_ElementParent || document;
    const name_pattern = new RegExp('(?:^|\\s)' + a_RegExpPattern + '(?!\\S)');
    const descendants = parent.getElementsByTagName('*');
    const result = [];
    for (let i = 0; i < descendants.length; i++) {
        const element = descendants[i];
        if (name_pattern.test(a_GenElementNameFunc(element))) {
            result.push(element);
        }
    }
    return result;
}
// Работа со строками
/**
 * Strips leading and trailing whitespace from a string.
 *
 * @param {?string} s - Text to trim; any falsy value is treated as ''.
 * @returns {string} The trimmed text.
 */
function TrimString(s) {
    const text = s || '';
    return text.trim();
}
/**
 * Drops everything up to and including the first occurrence of a splitter.
 *
 * @param {string} a_String - Text to cut.
 * @param {string} a_Splitter - Marker to search for.
 * @returns {string} The text after the first marker, or the input unchanged
 *     when the marker does not occur.
 */
function RemoveBeforeSplitter(a_String, a_Splitter) {
    const index = a_String.indexOf(a_Splitter);
    if (index === -1) {
        return a_String;
    }
    return a_String.substring(index + a_Splitter.length);
}
/**
 * Cuts a string at the first occurrence of a splitter.
 *
 * @param {string} a_String - Text to cut.
 * @param {string} a_Splitter - Marker to search for.
 * @param {boolean} [a_SaveSplitter] - When truthy the marker itself is kept at
 *     the end of the result; otherwise it is removed too.
 * @returns {string} The text before the first marker (optionally including
 *     it), or the input unchanged when the marker does not occur.
 */
function RemoveAfterSplitter(a_String, a_Splitter, a_SaveSplitter) {
    const index = a_String.indexOf(a_Splitter);
    if (index === -1) {
        return a_String;
    }
    const kept_splitter_length = a_SaveSplitter ? a_Splitter.length : 0;
    return a_String.substring(0, index + kept_splitter_length);
}
/**
 * Builds a post-processing function for extracted text.
 *
 * The returned function first runs the optional cleaner over the content and
 * then, only when both a tag and an alignment were supplied, wraps the result
 * in `<TAG style = "text-align:ALIGN;">…</TAG>`.
 *
 * @param {string} [a_OutTag] - Name of the wrapping tag.
 * @param {string} [a_TextAlign] - Value for the text-align style.
 * @param {function(string): string} [a_ClearTextFunc] - Optional cleaner applied before wrapping.
 * @returns {function(string, *): string} Function taking (content, element);
 *     the element argument is accepted but not used.
 */
function FinishWorkFuncTemplate(a_OutTag, a_TextAlign, a_ClearTextFunc) {
    return function FinishWorkFunc(a_Content, a_Element) {
        const content = a_ClearTextFunc ? a_ClearTextFunc(a_Content) : a_Content;
        if (a_OutTag && a_TextAlign) {
            return '<' + a_OutTag + ' style = "text-align:' + a_TextAlign + ';">' + content + '</' + a_OutTag + '>';
        }
        return content;
    };
}
/**
 * Removes the query string from a URL.
 *
 * @param {string} a_Url - URL to clean.
 * @returns {string} The URL cut at the first '?' (the '?' is removed as well);
 *     unchanged when it contains no '?'.
 */
function ClearUrl(a_Url) {
    // false: the '?' itself is dropped together with everything after it.
    return RemoveAfterSplitter(a_Url, '?', false);
}
/**
 * Builds a cleaner that strips leading boilerplate from extracted text.
 *
 * For each marker, in list order, the cleaner removes everything up to and
 * including the first occurrence of that marker (via RemoveBeforeSplitter);
 * markers that do not occur leave the text untouched.
 *
 * @param {string[]} [a_RemoveBeforeList] - Markers to cut at; a missing list
 *     yields a cleaner that returns its input unchanged.
 * @returns {function(string): string} The cleaner.
 */
function ClearTextFuncTemplate(a_RemoveBeforeList) {
    const markers = a_RemoveBeforeList || [];
    return function ClearTextFunc(a_Content) {
        let content = a_Content;
        for (const marker of markers) {
            content = RemoveBeforeSplitter(content, marker);
        }
        return content;
    };
}
// Работа с контейнерами
/**
 * Finds a PNG/JPG image inside the given containers and renders it as HTML.
 *
 * Every <img> descendant of every container is inspected; when several have
 * an http(s) URL containing ".png" or ".jpg", the last one found wins.
 *
 * @param {Iterable<Element>} a_Elements - Containers to search.
 * @param {string} a_TextAlign - Value for the text-align style of the wrapper.
 * @returns {string} `<p style = "text-align:…;"><img …/></p>` markup, or ''
 *     when no matching image exists.
 */
function GetImageInContainers(a_Elements, a_TextAlign) {
    // A regex literal keeps "\." as a literal dot. Inside a string passed to
    // new RegExp() the backslash is dropped and the dot matches any character.
    const image_url_pattern = /https?:\/\/.*\.(?:png|jpg)/;
    let img_src = '';
    for (const container of a_Elements) {
        for (const image of container.querySelectorAll('img')) {
            if (image_url_pattern.test(image.src)) {
                img_src = image.src;
            }
        }
    }
    if (img_src.length > 0) {
        return '<p style = "text-align:' + a_TextAlign + ';"><img src = "' + img_src + '" width = "600px"/></p>';
    }
    return '';
}
/**
 * Extracts text from each container and concatenates the results.
 *
 * For a container that supports querySelectorAll: when it has no descendants
 * or has a truthy innerText, its own text is grabbed; otherwise the text of
 * every descendant is grabbed and joined. Containers without
 * querySelectorAll contribute empty content. Each container's content is then
 * passed through a_FinishWorkFunc when one is given.
 *
 * @param {Iterable<Element>} a_Elements - Containers to process, in order.
 * @param {function(Element): string} a_GrubTextFunc - Extracts text from one element.
 * @param {function(string, Element): string} [a_FinishWorkFunc] - Optional
 *     post-processor called with (content, container).
 * @returns {string} Concatenation of the per-container results.
 */
function GetContentInContainers(a_Elements, a_GrubTextFunc, a_FinishWorkFunc) {
    let result = '';
    // for...of visits only the elements; for...in would also visit any other
    // enumerable keys of the collection.
    for (const container of a_Elements) {
        let content = '';
        if (container.querySelectorAll) {
            const descendants = container.querySelectorAll('*');
            if (descendants.length === 0 || container.innerText) {
                content += a_GrubTextFunc(container);
            } else {
                for (const descendant of descendants) {
                    content += a_GrubTextFunc(descendant);
                }
            }
        }
        result += a_FinishWorkFunc ? a_FinishWorkFunc(content, container) : content;
    }
    return result;
}
// Фильтрация элементов
/**
 * Returns the elements accepted by a checker, preserving their order.
 *
 * @param {ArrayLike<Element>} a_Elements - Elements to filter (array or array-like).
 * @param {function(Element): boolean} a_ElementChecker - Called with one
 *     element; a truthy result keeps the element.
 * @returns {Element[]} New array with the accepted elements.
 */
function FIlterElements(a_Elements, a_ElementChecker) {
    // The wrapper keeps the checker's single-argument contract: filter()
    // would otherwise also pass the index and the collection.
    return Array.prototype.filter.call(a_Elements, (element) => a_ElementChecker(element));
}
/**
 * Element checker that accepts every element.
 *
 * @param {Element} a_Element - Ignored.
 * @returns {boolean} Always true.
 */
function ElementCheckerTrue(a_Element) {
    return true;
}
/**
 * Element checker for ria.ru article blocks.
 *
 * @param {Element} a_Element - Element to test.
 * @returns {boolean} True when the element's data-type is 'text', 'quote' or
 *     'list'; false otherwise, including when the element has no dataset.
 */
function ElementCheckerRia(a_Element) {
    const type = a_Element.dataset ? a_Element.dataset.type : undefined;
    return type === 'text' || type === 'quote' || type === 'list';
}
/**
 * Element checker for zakonvremeni.ru: accepts the article body element.
 *
 * @param {Element} a_Element - Element to test.
 * @returns {boolean} True when the element's itemprop is exactly 'articleBody'.
 */
function ElementCheckerZV(a_Element) {
    // itemprop is an HTML attribute that DOM elements do not expose as a
    // property, so it is read with getAttribute. The property is only
    // consulted for objects that lack getAttribute.
    const itemprop = typeof a_Element.getAttribute === 'function'
        ? a_Element.getAttribute('itemprop')
        : a_Element.itemprop;
    return itemprop === 'articleBody';
}
/**
 * Builds the text extractor used for titles and paragraphs.
 *
 * The returned function yields the trimmed textContent of an element, but
 * only when the element's innerText is truthy; otherwise it yields ''.
 * NOTE(review): the innerText gate presumably skips elements with no
 * rendered text — confirm the intent.
 *
 * @returns {function(Element): string} The extractor.
 */
function GrubTextFuncTemplate() {
    return function GrubTextFunc(a_Element) {
        if (!a_Element.innerText) {
            return '';
        }
        return TrimString(a_Element.textContent);
    };
}
// Создание контента для стандартных новостей
/**
 * Assembles the HTML for a standard news article: title, image, paragraphs.
 *
 * Elements are located by class name with FindElementsByRegExp. The title is
 * wrapped in a centered <h2>, the image is rendered by GetImageInContainers
 * with 'center' alignment, and each accepted text element is cleaned with
 * a_ClearTextPatterns and wrapped in a justified <p>.
 *
 * @param {Element|Document} a_BaseElementTitle - Search root for the title.
 * @param {Element|Document} a_BaseElementImage - Search root for the image.
 * @param {Element|Document} a_BaseElementText - Search root for the text blocks.
 * @param {string} a_TitleRegExpElementPattern - Class-name pattern of title elements.
 * @param {string} a_ImageRegExpElementPattern - Class-name pattern of image containers.
 * @param {string} a_TextRegExpElementPattern - Class-name pattern of text elements.
 * @param {function(Element): boolean} a_ElementChecker - Extra filter for text elements.
 * @param {string[]} a_ClearTextPatterns - Markers passed to ClearTextFuncTemplate.
 * @returns {string} Concatenated title, image and paragraph markup.
 */
function MakeContentByNews(a_BaseElementTitle, a_BaseElementImage, a_BaseElementText, a_TitleRegExpElementPattern, a_ImageRegExpElementPattern, a_TextRegExpElementPattern, a_ElementChecker, a_ClearTextPatterns) {
    const grub_text_func = GrubTextFuncTemplate();
    const title_finish_text_func = FinishWorkFuncTemplate('h2', 'center');
    const paragraph_finish_text_func = FinishWorkFuncTemplate('p', 'justify', ClearTextFuncTemplate(a_ClearTextPatterns));

    const title_elements = FindElementsByRegExp(GetElementClassName, a_TitleRegExpElementPattern, a_BaseElementTitle);
    const image_elements = FindElementsByRegExp(GetElementClassName, a_ImageRegExpElementPattern, a_BaseElementImage);
    const text_elements = FIlterElements(
        FindElementsByRegExp(GetElementClassName, a_TextRegExpElementPattern, a_BaseElementText),
        a_ElementChecker
    );

    let content = '';
    content += GetContentInContainers(title_elements, grub_text_func, title_finish_text_func);
    content += GetImageInContainers(image_elements, 'center');
    content += GetContentInContainers(text_elements, grub_text_func, paragraph_finish_text_func);
    return content;
}
// Создание контента для сайта
/**
 * Builds the markup shown for the current page, depending on the host.
 *
 * - tass.ru and ria.ru: article HTML from MakeContentByNews, followed by a
 *   source link to the current URL without its query string.
 * - zakonvremeni.ru: plain text made of the title, the category names, the
 *   first sentence of the article body and the page URL; no source link.
 *
 * @returns {string} A `<textarea id = "news_content">` holding the content,
 *     or '' when nothing was extracted or the host is not supported.
 */
function MakeContent() {
    let content = '';
    let source_add = true;
    if (location.hostname === 'tass.ru') {
        const base_element = document.getElementById('content_box');
        content = MakeContentByNews(
            base_element,
            base_element,
            base_element,
            'tass_pkg_title--variant_h1_default.*',
            'Image_wrapper_.*',
            'Paragraph_paragraph.*',
            ElementCheckerTrue,
            ['/ТАСС/. ']
        );
    } else if (location.hostname === 'ria.ru') {
        const base_element = document.getElementsByClassName('article__header')[0];
        const base_element_text = document.getElementsByClassName('article__body')[0];
        content = MakeContentByNews(
            base_element,
            base_element,
            base_element_text,
            'article__title',
            'photoview__open',
            'article__block',
            ElementCheckerRia,
            ['– РИА Новости. ', '— РИА Новости. ']
        );
    } else if (location.hostname === 'zakonvremeni.ru') {
        const base_element = document.getElementsByClassName('item-page')[0];
        const article_body = base_element ? base_element.querySelector('[itemprop=articleBody]') : null;
        // The script matches every page of the site; pages without an article
        // body are skipped instead of raising a TypeError.
        if (article_body) {
            const grub_text_func = GrubTextFuncTemplate();
            const title = GetContentInContainers(FindElementsByRegExp(GetElementClassName, 'page-header', base_element), grub_text_func);
            const parent_category = GetContentInContainers(FindElementsByRegExp(GetElementClassName, 'parent-category-name', base_element), grub_text_func);
            const category = GetContentInContainers(FindElementsByRegExp(GetElementClassName, 'category-name', base_element), grub_text_func);
            // Keep the text up to and including the first '.'.
            const page = RemoveAfterSplitter(TrimString(article_body.textContent), '.', true);
            content = title + '\n' + parent_category + ' ' + category + '\n\n' + page + '\n' + document.URL;
        }
        source_add = false;
    }
    if (content.length === 0) {
        return '';
    }
    let result = '<textarea id = "news_content" rows="10" cols="100">' + content;
    if (source_add) {
        result += '<p style="text-align: justify;">Источник: <a href = "' + ClearUrl(document.URL) + '">' + location.hostname + '</a></p>';
    }
    result += '</textarea>';
    return result;
}
// Build the parsed-content markup and insert it as the first child of <body>.
const panel_markup = '<div style="margin: 0pt auto; width: 800px; text-align: center;">' + MakeContent() + '</div>';
const panel = document.createElement("div");
panel.innerHTML = panel_markup;
const page_body = document.body;
page_body.insertBefore(panel, page_body.firstChild);
} ) ( ) ;