Unverified Commit c682ea0d authored by Tobias Groza, committed by GitHub

Merge pull request #451 from TeamNewPipe/meta_info

Extract stream and search meta info for YouTube
parents 853a65a1 41a8ed62
package org.schabi.newpipe.extractor;
import org.schabi.newpipe.extractor.stream.Description;
import java.io.Serializable;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import javax.annotation.Nonnull;
public class MetaInfo implements Serializable {
private String title = "";
private Description content;
private List<URL> urls = new ArrayList<>();
private List<String> urlTexts = new ArrayList<>();
public MetaInfo(@Nonnull final String title, @Nonnull final Description content,
@Nonnull final List<URL> urls, @Nonnull final List<String> urlTexts) {
this.title = title;
this.content = content;
this.urls = urls;
this.urlTexts = urlTexts;
}
public MetaInfo() {
}
/**
* @return Title of the info. Can be empty.
*/
@Nonnull
public String getTitle() {
return title;
}
public void setTitle(@Nonnull final String title) {
this.title = title;
}
@Nonnull
public Description getContent() {
return content;
}
public void setContent(@Nonnull final Description content) {
this.content = content;
}
@Nonnull
public List<URL> getUrls() {
return urls;
}
public void setUrls(@Nonnull final List<URL> urls) {
this.urls = urls;
}
public void addUrl(@Nonnull final URL url) {
urls.add(url);
}
@Nonnull
public List<String> getUrlTexts() {
return urlTexts;
}
public void setUrlTexts(@Nonnull final List<String> urlTexts) {
this.urlTexts = urlTexts;
}
public void addUrlText(@Nonnull final String urlText) {
urlTexts.add(urlText);
}
}
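For illustration, a minimal sketch of how the new container could be filled and read by hand; the values are invented, and real instances are produced by the service extractors changed further down.

import org.schabi.newpipe.extractor.MetaInfo;
import org.schabi.newpipe.extractor.stream.Description;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Collections;

// Illustrative sketch, not part of this change: values are invented.
class MetaInfoExample {
    public static void main(final String[] args) throws MalformedURLException {
        final MetaInfo covidInfo = new MetaInfo(
                "COVID-19",
                new Description("Get the latest information from the WHO.", Description.PLAIN_TEXT),
                Collections.singletonList(new URL("https://www.who.int/")),
                Collections.singletonList("Learn more"));
        // getUrls() and getUrlTexts() are parallel lists: entry i of one describes entry i of the other.
        System.out.println(covidInfo.getTitle() + ": " + covidInfo.getContent().getContent());
    }
}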
......@@ -2,12 +2,14 @@ package org.schabi.newpipe.extractor.search;
import org.schabi.newpipe.extractor.InfoItem;
import org.schabi.newpipe.extractor.ListExtractor;
import org.schabi.newpipe.extractor.MetaInfo;
import org.schabi.newpipe.extractor.StreamingService;
import org.schabi.newpipe.extractor.exceptions.ExtractionException;
import org.schabi.newpipe.extractor.exceptions.ParsingException;
import org.schabi.newpipe.extractor.linkhandler.SearchQueryHandler;
import javax.annotation.Nonnull;
import java.util.List;
public abstract class SearchExtractor extends ListExtractor<InfoItem> {
......@@ -57,4 +59,15 @@ public abstract class SearchExtractor extends ListExtractor<InfoItem> {
* @return whether the results come from a corrected query or not.
*/
public abstract boolean isCorrectedSearch() throws ParsingException;
/**
* Meta information about the search query.
* <p>
* Example: on YouTube, a search for "Covid-19" shows a box with information
* from the WHO about Covid-19 and a link to the WHO's website.
* @return additional meta information about the search query
* @throws ParsingException if the meta information could not be parsed
*/
@Nonnull
public abstract List<MetaInfo> getMetaInfo() throws ParsingException;
}
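For illustration, a sketch of how a client could render whatever a SearchExtractor reports, pairing each URL with its display text by index (the parallel-list layout MetaInfo uses); it relies only on the accessors defined above.

import org.schabi.newpipe.extractor.MetaInfo;
import org.schabi.newpipe.extractor.exceptions.ParsingException;
import org.schabi.newpipe.extractor.search.SearchExtractor;
import java.util.List;

// Illustrative sketch, not part of this change. Assumes the extractor's page
// has already been loaded via fetchPage().
class MetaInfoPrinter {
    static void print(final SearchExtractor extractor) throws ParsingException {
        final List<MetaInfo> metaInfoList = extractor.getMetaInfo();
        for (final MetaInfo metaInfo : metaInfoList) {
            System.out.println(metaInfo.getTitle());
            System.out.println(metaInfo.getContent().getContent());
            // URLs and their display texts are parallel lists.
            for (int i = 0; i < metaInfo.getUrls().size(); i++) {
                System.out.println(metaInfo.getUrlTexts().get(i) + ": " + metaInfo.getUrls().get(i));
            }
        }
    }
}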
package org.schabi.newpipe.extractor.search;
import org.schabi.newpipe.extractor.*;
import org.schabi.newpipe.extractor.exceptions.ExtractionException;
import org.schabi.newpipe.extractor.linkhandler.SearchQueryHandler;
import org.schabi.newpipe.extractor.utils.ExtractorHelper;
import java.io.IOException;
import java.util.List;
import javax.annotation.Nonnull;
public class SearchInfo extends ListInfo<InfoItem> {
private String searchString;
private String searchSuggestion;
private boolean isCorrectedSearch;
private List<MetaInfo> metaInfo;
public SearchInfo(int serviceId,
SearchQueryHandler qIHandler,
......@@ -51,6 +51,11 @@ public class SearchInfo extends ListInfo<InfoItem> {
} catch (Exception e) {
info.addError(e);
}
try {
info.setMetaInfo(extractor.getMetaInfo());
} catch (Exception e) {
info.addError(e);
}
ListExtractor.InfoItemsPage<InfoItem> page = ExtractorHelper.getItemsPageOrLogError(info, extractor);
info.setRelatedItems(page.getItems());
......@@ -87,4 +92,13 @@ public class SearchInfo extends ListInfo<InfoItem> {
public void setSearchSuggestion(String searchSuggestion) {
this.searchSuggestion = searchSuggestion;
}
@Nonnull
public List<MetaInfo> getMetaInfo() {
return metaInfo;
}
public void setMetaInfo(@Nonnull List<MetaInfo> metaInfo) {
this.metaInfo = metaInfo;
}
}
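For context, an end-to-end sketch of pulling search meta info through SearchInfo, assuming NewPipe has been initialised with a working Downloader implementation and that the existing getInfo(service, handler) overload is used:

import org.schabi.newpipe.extractor.MetaInfo;
import org.schabi.newpipe.extractor.ServiceList;
import org.schabi.newpipe.extractor.StreamingService;
import org.schabi.newpipe.extractor.search.SearchInfo;

// Illustrative sketch, not part of this change. Assumes NewPipe.init(...) has been
// called with a working Downloader and that network access is available.
class SearchMetaInfoDemo {
    static void run() throws Exception {
        final StreamingService youtube = ServiceList.YouTube;
        final SearchInfo info = SearchInfo.getInfo(youtube,
                youtube.getSearchQHFactory().fromQuery("covid-19"));
        for (final MetaInfo metaInfo : info.getMetaInfo()) {
            System.out.println(metaInfo.getTitle() + " - " + metaInfo.getContent().getContent());
        }
    }
}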
......@@ -6,6 +6,7 @@ import com.grack.nanojson.JsonParser;
import com.grack.nanojson.JsonParserException;
import org.schabi.newpipe.extractor.InfoItem;
import org.schabi.newpipe.extractor.MetaInfo;
import org.schabi.newpipe.extractor.Page;
import org.schabi.newpipe.extractor.StreamingService;
import org.schabi.newpipe.extractor.channel.ChannelInfoItem;
......@@ -20,6 +21,7 @@ import org.schabi.newpipe.extractor.services.media_ccc.extractors.infoItems.Medi
import org.schabi.newpipe.extractor.services.media_ccc.linkHandler.MediaCCCConferencesListLinkHandlerFactory;
import java.io.IOException;
import java.util.Collections;
import java.util.List;
import javax.annotation.Nonnull;
......@@ -55,6 +57,12 @@ public class MediaCCCSearchExtractor extends SearchExtractor {
return false;
}
@Nonnull
@Override
public List<MetaInfo> getMetaInfo() {
return Collections.emptyList();
}
@Nonnull
@Override
public InfoItemsPage<InfoItem> getInitialPage() {
......
......@@ -6,6 +6,7 @@ import com.grack.nanojson.JsonParser;
import com.grack.nanojson.JsonParserException;
import org.schabi.newpipe.extractor.MediaFormat;
import org.schabi.newpipe.extractor.MetaInfo;
import org.schabi.newpipe.extractor.StreamingService;
import org.schabi.newpipe.extractor.downloader.Downloader;
import org.schabi.newpipe.extractor.exceptions.ExtractionException;
......@@ -301,4 +302,10 @@ public class MediaCCCStreamExtractor extends StreamExtractor {
public List<StreamSegment> getStreamSegments() {
return Collections.emptyList();
}
@Nonnull
@Override
public List<MetaInfo> getMetaInfo() {
return Collections.emptyList();
}
}
......@@ -4,6 +4,7 @@ import com.grack.nanojson.JsonObject;
import com.grack.nanojson.JsonParser;
import org.schabi.newpipe.extractor.InfoItem;
import org.schabi.newpipe.extractor.MetaInfo;
import org.schabi.newpipe.extractor.Page;
import org.schabi.newpipe.extractor.StreamingService;
import org.schabi.newpipe.extractor.downloader.Downloader;
......@@ -17,6 +18,8 @@ import org.schabi.newpipe.extractor.services.peertube.PeertubeParsingHelper;
import org.schabi.newpipe.extractor.utils.Utils;
import java.io.IOException;
import java.util.Collections;
import java.util.List;
import javax.annotation.Nonnull;
......@@ -42,6 +45,12 @@ public class PeertubeSearchExtractor extends SearchExtractor {
return false;
}
@Nonnull
@Override
public List<MetaInfo> getMetaInfo() {
return Collections.emptyList();
}
@Override
public InfoItemsPage<InfoItem> getInitialPage() throws IOException, ExtractionException {
final String pageUrl = getUrl() + "&" + START_KEY + "=0&" + COUNT_KEY + "=" + ITEMS_PER_PAGE;
......
......@@ -5,6 +5,7 @@ import com.grack.nanojson.JsonObject;
import com.grack.nanojson.JsonParser;
import com.grack.nanojson.JsonParserException;
import org.schabi.newpipe.extractor.MediaFormat;
import org.schabi.newpipe.extractor.MetaInfo;
import org.schabi.newpipe.extractor.NewPipe;
import org.schabi.newpipe.extractor.StreamingService;
import org.schabi.newpipe.extractor.downloader.Downloader;
......@@ -309,6 +310,12 @@ public class PeertubeStreamExtractor extends StreamExtractor {
return Collections.emptyList();
}
@Nonnull
@Override
public List<MetaInfo> getMetaInfo() {
return Collections.emptyList();
}
private String getRelatedStreamsUrl(final List<String> tags) throws UnsupportedEncodingException {
final String url = baseUrl + PeertubeSearchQueryHandlerFactory.SEARCH_ENDPOINT;
final StringBuilder params = new StringBuilder();
......
......@@ -8,6 +8,7 @@ import com.grack.nanojson.JsonParserException;
import org.schabi.newpipe.extractor.InfoItem;
import org.schabi.newpipe.extractor.InfoItemExtractor;
import org.schabi.newpipe.extractor.InfoItemsCollector;
import org.schabi.newpipe.extractor.MetaInfo;
import org.schabi.newpipe.extractor.Page;
import org.schabi.newpipe.extractor.StreamingService;
import org.schabi.newpipe.extractor.downloader.Downloader;
......@@ -22,6 +23,8 @@ import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Collections;
import java.util.List;
import javax.annotation.Nonnull;
......@@ -47,6 +50,12 @@ public class SoundcloudSearchExtractor extends SearchExtractor {
return false;
}
@Nonnull
@Override
public List<MetaInfo> getMetaInfo() {
return Collections.emptyList();
}
@Nonnull
@Override
public InfoItemsPage<InfoItem> getInitialPage() throws IOException, ExtractionException {
......
......@@ -6,6 +6,7 @@ import com.grack.nanojson.JsonParser;
import com.grack.nanojson.JsonParserException;
import org.schabi.newpipe.extractor.MediaFormat;
import org.schabi.newpipe.extractor.MetaInfo;
import org.schabi.newpipe.extractor.NewPipe;
import org.schabi.newpipe.extractor.StreamingService;
import org.schabi.newpipe.extractor.downloader.Downloader;
......@@ -327,4 +328,10 @@ public class SoundcloudStreamExtractor extends StreamExtractor {
public List<StreamSegment> getStreamSegments() {
return Collections.emptyList();
}
@Nonnull
@Override
public List<MetaInfo> getMetaInfo() {
return Collections.emptyList();
}
}
package org.schabi.newpipe.extractor.services.youtube;
import com.grack.nanojson.*;
import com.grack.nanojson.JsonArray;
import com.grack.nanojson.JsonObject;
import com.grack.nanojson.JsonParser;
......@@ -8,6 +9,7 @@ import com.grack.nanojson.JsonWriter;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.schabi.newpipe.extractor.MetaInfo;
import org.schabi.newpipe.extractor.Page;
import org.schabi.newpipe.extractor.downloader.Response;
import org.schabi.newpipe.extractor.exceptions.ContentNotAvailableException;
......@@ -15,9 +17,12 @@ import org.schabi.newpipe.extractor.exceptions.ExtractionException;
import org.schabi.newpipe.extractor.exceptions.ParsingException;
import org.schabi.newpipe.extractor.exceptions.ReCaptchaException;
import org.schabi.newpipe.extractor.localization.Localization;
import org.schabi.newpipe.extractor.stream.Description;
import org.schabi.newpipe.extractor.utils.Parser;
import org.schabi.newpipe.extractor.utils.Utils;
import javax.annotation.Nonnull;
import javax.annotation.Nullable;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
......@@ -28,13 +33,11 @@ import java.time.LocalDate;
import java.time.OffsetDateTime;
import java.time.ZoneOffset;
import java.time.format.DateTimeParseException;
import java.util.*;
import static org.schabi.newpipe.extractor.NewPipe.getDownloader;
import static org.schabi.newpipe.extractor.utils.JsonUtils.EMPTY_STRING;
import static org.schabi.newpipe.extractor.utils.Utils.HTTP;
import static org.schabi.newpipe.extractor.utils.Utils.HTTPS;
import static org.schabi.newpipe.extractor.utils.Utils.isNullOrEmpty;
......@@ -76,41 +79,35 @@ public class YoutubeParsingHelper {
private static final String FEED_BASE_CHANNEL_ID = "https://www.youtube.com/feeds/videos.xml?channel_id=";
private static final String FEED_BASE_USER = "https://www.youtube.com/feeds/videos.xml?user=";
private static final String[] RECAPTCHA_DETECTION_SELECTORS = {
"form[action*=\"/das_captcha\"]",
"input[name*=\"action_recaptcha_verify\"]"
};
public static Document parseAndCheckPage(final String url, final Response response) throws ReCaptchaException {
final Document document = Jsoup.parse(response.responseBody(), url);
for (String detectionSelector : RECAPTCHA_DETECTION_SELECTORS) {
if (!document.select(detectionSelector).isEmpty()) {
throw new ReCaptchaException("reCAPTCHA challenge requested (detected with selector: \"" + detectionSelector + "\")", url);
}
}
return document;
}
private static boolean isGoogleURL(String url) {
url = extractCachedUrlIfNeeded(url);
try {
final URL u = new URL(url);
final String host = u.getHost();
return host.startsWith("google.") || host.startsWith("m.google.");
} catch (MalformedURLException e) {
return false;
}
}
public static boolean isYoutubeURL(final URL url) {
final String host = url.getHost();
return host.equalsIgnoreCase("youtube.com") || host.equalsIgnoreCase("www.youtube.com")
|| host.equalsIgnoreCase("m.youtube.com") || host.equalsIgnoreCase("music.youtube.com");
}
public static boolean isYoutubeServiceURL(final URL url) {
final String host = url.getHost();
return host.equalsIgnoreCase("www.youtube-nocookie.com") || host.equalsIgnoreCase("youtu.be");
}
public static boolean isHooktubeURL(final URL url) {
final String host = url.getHost();
return host.equalsIgnoreCase("hooktube.com");
}
public static boolean isInvidioURL(final URL url) {
final String host = url.getHost();
return host.equalsIgnoreCase("invidio.us")
|| host.equalsIgnoreCase("dev.invidio.us")
|| host.equalsIgnoreCase("www.invidio.us")
......@@ -184,7 +181,7 @@ public class YoutubeParsingHelper {
}
}
public static OffsetDateTime parseDateFrom(final String textualUploadDate) throws ParsingException {
try {
return OffsetDateTime.parse(textualUploadDate);
} catch (DateTimeParseException e) {
......@@ -247,7 +244,7 @@ public class YoutubeParsingHelper {
}
}
public static JsonObject getInitialData(final String html) throws ParsingException {
try {
try {
final String initialData = Parser.matchGroup1("window\\[\"ytInitialData\"\\]\\s*=\\s*(\\{.*?\\});", html);
......@@ -264,10 +261,9 @@ public class YoutubeParsingHelper {
public static boolean isHardcodedClientVersionValid() throws IOException, ExtractionException {
final String url = "https://www.youtube.com/results?search_query=test&pbj=1";
final Map<String, List<String>> headers = new HashMap<>();
headers.put("X-YouTube-Client-Name", Collections.singletonList("1"));
headers.put("X-YouTube-Client-Version", Collections.singletonList(HARDCODED_CLIENT_VERSION));
final String response = getDownloader().get(url, headers).responseBody();
return response.length() > 50; // ensure to have a valid response
......@@ -390,14 +386,14 @@ public class YoutubeParsingHelper {
.end().done().getBytes("UTF-8");
// @formatter:on
final Map<String, List<String>> headers = new HashMap<>();
headers.put("X-YouTube-Client-Name", Collections.singletonList(HARDCODED_YOUTUBE_MUSIC_KEYS[1]));
headers.put("X-YouTube-Client-Version", Collections.singletonList(HARDCODED_YOUTUBE_MUSIC_KEYS[2]));
headers.put("Origin", Collections.singletonList("https://music.youtube.com"));
headers.put("Referer", Collections.singletonList("music.youtube.com"));
headers.put("Content-Type", Collections.singletonList("application/json"));
final String response = getDownloader().post(url, headers, json).responseBody();
return response.length() > 50; // ensure to have a valid response
}
......@@ -432,6 +428,7 @@ public class YoutubeParsingHelper {
return youtubeMusicKeys = new String[]{key, clientName, clientVersion};
}
@Nullable
public static String getUrlFromNavigationEndpoint(JsonObject navigationEndpoint) throws ParsingException {
if (navigationEndpoint.has("urlEndpoint")) {
String internUrl = navigationEndpoint.getObject("urlEndpoint").getString("url");
......@@ -493,6 +490,7 @@ public class YoutubeParsingHelper {
* @param html whether to return HTML, by parsing the navigationEndpoint
* @return text in the JSON object or {@code null}
*/
@Nullable
public static String getTextFromObject(JsonObject textObject, boolean html) throws ParsingException {
if (isNullOrEmpty(textObject)) return null;
......@@ -500,8 +498,8 @@ public class YoutubeParsingHelper {
if (textObject.getArray("runs").isEmpty()) return null;
final StringBuilder textBuilder = new StringBuilder();
for (final Object textPart : textObject.getArray("runs")) {
String text = ((JsonObject) textPart).getString("text");
if (html && ((JsonObject) textPart).has("navigationEndpoint")) {
String url = getUrlFromNavigationEndpoint(((JsonObject) textPart).getObject("navigationEndpoint"));
......@@ -523,6 +521,7 @@ public class YoutubeParsingHelper {
return text;
}
@Nullable
public static String getTextFromObject(JsonObject textObject) throws ParsingException {
return getTextFromObject(textObject, false);
}
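// Illustrative note, not part of this change: getTextFromObject() accepts either a
// {"simpleText": "..."} object or a {"runs": [{"text": "...", "navigationEndpoint": {...}}, ...]}
// object; with html == true, runs carrying a navigationEndpoint are wrapped in <a href="..."> links
// resolved through getUrlFromNavigationEndpoint(). The "simpleText" branch is assumed from the
// existing implementation and is not visible in this hunk.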
......@@ -650,4 +649,124 @@ public class YoutubeParsingHelper {
}
}
}
@Nonnull
public static List<MetaInfo> getMetaInfo(final JsonArray contents) throws ParsingException {
final List<MetaInfo> metaInfo = new ArrayList<>();
for (final Object content : contents) {
final JsonObject resultObject = (JsonObject) content;
if (resultObject.has("itemSectionRenderer")) {
for (final Object sectionContentObject :
resultObject.getObject("itemSectionRenderer").getArray("contents")) {
final JsonObject sectionContent = (JsonObject) sectionContentObject;
if (sectionContent.has("infoPanelContentRenderer")) {
metaInfo.add(getInfoPanelContent(sectionContent.getObject("infoPanelContentRenderer")));
}
if (sectionContent.has("clarificationRenderer")) {
metaInfo.add(getClarificationRendererContent(
sectionContent.getObject("clarificationRenderer")));
}
}
}
}
return metaInfo;
}
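// Illustrative note, not part of this change: the rough shape of the "contents" array walked
// above, limited to the keys actually read here (real YouTube responses carry many more fields):
// [ { "itemSectionRenderer": { "contents": [
//       { "infoPanelContentRenderer": { "paragraphs": [...], "sourceEndpoint": {...}, "inlineSource": {...} } },
//       { "clarificationRenderer": { "contentTitle": {...}, "text": {...}, "actionButton": {...} } }
// ] } } ]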
@Nonnull
private static MetaInfo getInfoPanelContent(final JsonObject infoPanelContentRenderer)
throws ParsingException {
final MetaInfo metaInfo = new MetaInfo();
final StringBuilder sb = new StringBuilder();
for (final Object paragraph : infoPanelContentRenderer.getArray("paragraphs")) {
if (sb.length() != 0) {
sb.append("<br>");
}
sb.append(YoutubeParsingHelper.getTextFromObject((JsonObject) paragraph));
}
metaInfo.setContent(new Description(sb.toString(), Description.HTML));
if (infoPanelContentRenderer.has("sourceEndpoint")) {
final String metaInfoLinkUrl = YoutubeParsingHelper.getUrlFromNavigationEndpoint(
infoPanelContentRenderer.getObject("sourceEndpoint"));
try {
metaInfo.addUrl(new URL(Objects.requireNonNull(extractCachedUrlIfNeeded(metaInfoLinkUrl))));
} catch (final NullPointerException | MalformedURLException e) {
throw new ParsingException("Could not get metadata info URL", e);
}
final String metaInfoLinkText = YoutubeParsingHelper.getTextFromObject(
infoPanelContentRenderer.getObject("inlineSource"));
if (isNullOrEmpty(metaInfoLinkText)) {
throw new ParsingException("Could not get metadata info link text.");
}
metaInfo.addUrlText(metaInfoLinkText);
}
return metaInfo;
}
@Nonnull
private static MetaInfo getClarificationRendererContent(final JsonObject clarificationRenderer)
throws ParsingException {
final MetaInfo metaInfo = new MetaInfo();
final String title = YoutubeParsingHelper.getTextFromObject(clarificationRenderer.getObject("contentTitle"));
final String text = YoutubeParsingHelper.getTextFromObject(clarificationRenderer.getObject("text"));
if (title == null || text == null) {
throw new ParsingException("Could not extract clarification renderer content");
}
metaInfo.setTitle(title);
metaInfo.setContent(new Description(text, Description.PLAIN_TEXT));
if (clarificationRenderer.has("actionButton")) {
final JsonObject actionButton = clarificationRenderer.getObject("actionButton")
.getObject("buttonRenderer");
try {
final String url = YoutubeParsingHelper.get