/* * Copyright © 2011 Igalia S.L. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2, or (at your option) * any later version. * * Some parts of this file based on the Midori's 'adblock' extension, * licensed with the GNU Lesser General Public License 2.1, Copyright * (C) 2009-2010 Christian Dywan and 2009 * Alexander Butenko . Check Midori's web site * at http://www.twotoasts.de * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ #include "config.h" #include "uri-tester.h" #include "ephy-debug.h" #include #include #include #define DEFAULT_FILTER_URL "http://adblockplus.mozdev.org/easylist/easylist.txt" #define FILTERS_LIST_FILENAME "filters.list" #define SIGNATURE_SIZE 8 #define UPDATE_FREQUENCY 24 * 60 * 60 /* In seconds */ #define URI_TESTER_GET_PRIVATE(object) (G_TYPE_INSTANCE_GET_PRIVATE ((object), TYPE_URI_TESTER, UriTesterPrivate)) struct _UriTesterPrivate { GSList *filters; char *data_dir; GHashTable *pattern; GHashTable *keys; GHashTable *optslist; GHashTable *urlcache; GString *blockcss; GString *blockcssprivate; }; enum { PROP_0, PROP_FILTERS, PROP_BASE_DATA_DIR, }; G_DEFINE_TYPE (UriTester, uri_tester, G_TYPE_OBJECT) /* Private functions. */ static GString * uri_tester_fixup_regexp (const char *prefix, char *src); static gboolean uri_tester_parse_file_at_uri (UriTester *tester, const char *fileuri); static char * uri_tester_ensure_data_dir (const char *base_data_dir) { char *folder; /* Ensure adblock's dir is there. */ folder = g_build_filename (base_data_dir, "adblock", NULL); g_mkdir_with_parents (folder, 0700); return folder; } static char* uri_tester_get_fileuri_for_url (UriTester *tester, const char *url) { char *filename = NULL; char *path = NULL; char *uri = NULL; if (!strncmp (url, "file", 4)) return g_strndup (url + 7, strlen (url) - 7); filename = g_compute_checksum_for_string (G_CHECKSUM_MD5, url, -1); path = g_build_filename (tester->priv->data_dir, filename, NULL); uri = g_filename_to_uri (path, NULL, NULL); g_free (filename); g_free (path); return uri; } typedef struct { UriTester *tester; char *dest_uri; } RetrieveFilterAsyncData; static void uri_tester_retrieve_filter_finished (GFile *src, GAsyncResult *result, RetrieveFilterAsyncData *data) { GError *error = NULL; if (!g_file_copy_finish (src, result, &error)) { LOG ("Error retrieving filter: %s\n", error->message); g_error_free (error); } else uri_tester_parse_file_at_uri (data->tester, data->dest_uri); g_object_unref (data->tester); g_free (data->dest_uri); g_slice_free (RetrieveFilterAsyncData, data); } static void uri_tester_retrieve_filter (UriTester *tester, const char *url, const char *fileuri) { GFile *src; GFile *dest; RetrieveFilterAsyncData *data; g_return_if_fail (IS_URI_TESTER (tester)); g_return_if_fail (url != NULL); g_return_if_fail (fileuri != NULL); src = g_file_new_for_uri (url); dest = g_file_new_for_uri (fileuri); data = g_slice_new (RetrieveFilterAsyncData); data->tester = g_object_ref (tester); data->dest_uri = g_file_get_uri (dest); g_file_copy_async (src, dest, G_FILE_COPY_OVERWRITE, G_PRIORITY_DEFAULT, NULL, NULL, NULL, (GAsyncReadyCallback)uri_tester_retrieve_filter_finished, data); g_object_unref (src); g_object_unref (dest); } static gboolean uri_tester_filter_is_valid (const char *fileuri) { GFile *file = NULL; GFileInfo *file_info = NULL; gboolean result; /* Now check if the local file is too old. */ file = g_file_new_for_uri (fileuri); file_info = g_file_query_info (file, G_FILE_ATTRIBUTE_TIME_MODIFIED, G_FILE_QUERY_INFO_NONE, NULL, NULL); result = FALSE; if (file_info) { GTimeVal current_time; GTimeVal mod_time; g_get_current_time (¤t_time); g_file_info_get_modification_time (file_info, &mod_time); if (current_time.tv_sec > mod_time.tv_sec) { gint64 expire_time = mod_time.tv_sec + UPDATE_FREQUENCY; result = current_time.tv_sec < expire_time; } g_object_unref (file_info); } g_object_unref (file); return result; } static void uri_tester_load_patterns (UriTester *tester) { GSList *filter = NULL; char *url = NULL; char *fileuri = NULL; /* Load patterns from the list of filters. */ for (filter = tester->priv->filters; filter; filter = g_slist_next(filter)) { url = (char*)filter->data; fileuri = uri_tester_get_fileuri_for_url (tester, url); if (!uri_tester_filter_is_valid (fileuri)) uri_tester_retrieve_filter (tester, url, fileuri); else uri_tester_parse_file_at_uri (tester, fileuri); g_free (fileuri); } } static void uri_tester_load_filters (UriTester *tester) { GSList *list = NULL; char *filepath = NULL; filepath = g_build_filename (tester->priv->data_dir, FILTERS_LIST_FILENAME, NULL); if (g_file_test (filepath, G_FILE_TEST_EXISTS | G_FILE_TEST_IS_REGULAR)) { GFile *file = NULL; char *contents = NULL; gsize length = 0; GError *error = NULL; file = g_file_new_for_path (filepath); if (g_file_load_contents (file, NULL, &contents, &length, NULL, &error)) { char **urls_array = NULL; char *url = NULL; int i = 0; urls_array = g_strsplit (contents, ";", -1); for (i = 0; urls_array [i]; i++) { url = g_strstrip (g_strdup (urls_array[i])); if (!g_str_equal (url, "")) list = g_slist_prepend (list, url); } g_strfreev (urls_array); g_free (contents); } if (error) { LOG ("Error loading filters from %s: %s", filepath, error->message); g_error_free (error); } g_object_unref (file); } else { /* No file exists yet, so use the default filter and save it. */ list = g_slist_prepend (list, g_strdup (DEFAULT_FILTER_URL)); } g_free (filepath); uri_tester_set_filters (tester, g_slist_reverse(list)); } static void uri_tester_save_filters (UriTester *tester) { FILE *file = NULL; char *filepath = NULL; filepath = g_build_filename (tester->priv->data_dir, FILTERS_LIST_FILENAME, NULL); if ((file = g_fopen (filepath, "w"))) { GSList *item = NULL; char *filter = NULL; for (item = tester->priv->filters; item; item = g_slist_next (item)) { filter = g_strdup_printf ("%s;", (char*)item->data); fputs (filter, file); g_free (filter); } fclose (file); } g_free (filepath); } static inline int uri_tester_check_rule (UriTester *tester, GRegex *regex, const char *patt, const char *req_uri, const char *page_uri) { char *opts; if (!g_regex_match_full (regex, req_uri, -1, 0, 0, NULL, NULL)) return FALSE; opts = g_hash_table_lookup (tester->priv->optslist, patt); if (opts && g_regex_match_simple (",third-party", opts, G_REGEX_CASELESS, G_REGEX_MATCH_NOTEMPTY)) { if (page_uri && g_regex_match_full (regex, page_uri, -1, 0, 0, NULL, NULL)) return FALSE; } /* TODO: Domain opt check */ LOG ("blocked by pattern regexp=%s -- %s", g_regex_get_pattern (regex), req_uri); return TRUE; } static inline gboolean uri_tester_is_matched_by_pattern (UriTester *tester, const char *req_uri, const char *page_uri) { GHashTableIter iter; gpointer patt, regex; g_hash_table_iter_init (&iter, tester->priv->pattern); while (g_hash_table_iter_next (&iter, &patt, ®ex)) { if (uri_tester_check_rule(tester, regex, patt, req_uri, page_uri)) return TRUE; } return FALSE; } static inline gboolean uri_tester_is_matched_by_key (UriTester *tester, const char *opts, const char *req_uri, const char *page_uri) { UriTesterPrivate *priv = NULL; char *uri; int len; int pos = 0; GList *regex_bl = NULL; GString *guri; gboolean ret = FALSE; char sig[SIGNATURE_SIZE + 1]; priv = tester->priv; memset (&sig[0], 0, sizeof (sig)); /* Signatures are made on pattern, so we need to convert url to a pattern as well */ guri = uri_tester_fixup_regexp ("", (char*)req_uri); uri = guri->str; len = guri->len; for (pos = len - SIGNATURE_SIZE; pos >= 0; pos--) { GRegex *regex; strncpy (sig, uri + pos, SIGNATURE_SIZE); regex = g_hash_table_lookup (priv->keys, sig); /* Dont check if regex is already blacklisted */ if (!regex || g_list_find (regex_bl, regex)) continue; ret = uri_tester_check_rule (tester, regex, sig, req_uri, page_uri); if (ret) break; regex_bl = g_list_prepend (regex_bl, regex); } g_string_free (guri, TRUE); g_list_free (regex_bl); return ret; } static gboolean uri_tester_is_matched (UriTester *tester, const char *opts, const char *req_uri, const char *page_uri) { UriTesterPrivate *priv = NULL; char *value; priv = tester->priv; /* Check cached URLs first. */ if ((value = g_hash_table_lookup (priv->urlcache, req_uri))) return (value[0] != '0') ? TRUE : FALSE; /* Look for a match either by key or by pattern. */ if (uri_tester_is_matched_by_key (tester, opts, req_uri, page_uri)) { g_hash_table_insert (priv->urlcache, g_strdup (req_uri), g_strdup("1")); return TRUE; } /* Matching by pattern is pretty expensive, so do it if needed only. */ if (uri_tester_is_matched_by_pattern (tester, req_uri, page_uri)) { g_hash_table_insert (priv->urlcache, g_strdup (req_uri), g_strdup("1")); return TRUE; } g_hash_table_insert (priv->urlcache, g_strdup (req_uri), g_strdup("0")); return FALSE; } static GString * uri_tester_fixup_regexp (const char *prefix, char *src) { GString *str; int len = 0; if (!src) return NULL; str = g_string_new (prefix); /* lets strip first .* */ if (src[0] == '*') { (void)*src++; } do { switch (*src) { case '*': g_string_append (str, ".*"); break; /*case '.': g_string_append (str, "\\."); break;*/ case '?': g_string_append (str, "\\?"); break; case '|': /* FIXME: We actually need to match :[0-9]+ or '/'. Sign means "here could be port number or nothing". So bla.com^ will match bla.com/ or bla.com:8080/ but not bla.com.au/ */ case '^': case '+': break; default: g_string_append_printf (str,"%c", *src); break; } src++; } while (*src); len = str->len; /* We dont need .* in the end of url. Thats stupid */ if (str->str && str->str[len-1] == '*' && str->str[len-2] == '.') g_string_erase (str, len-2, 2); return str; } static gboolean uri_tester_compile_regexp (UriTester *tester, GString *gpatt, char *opts) { GRegex *regex; GError *error = NULL; char *patt; int len; if (!gpatt) return FALSE; patt = gpatt->str; len = gpatt->len; /* TODO: Play with optimization flags */ regex = g_regex_new (patt, G_REGEX_OPTIMIZE, G_REGEX_MATCH_NOTEMPTY, &error); if (error) { g_warning ("%s: %s", G_STRFUNC, error->message); g_error_free (error); g_regex_unref (regex); return TRUE; } if (!g_regex_match_simple ("^/.*[\\^\\$\\*].*/$", patt, G_REGEX_UNGREEDY, G_REGEX_MATCH_NOTEMPTY)) { int signature_count = 0; int pos = 0; char *sig; for (pos = len - SIGNATURE_SIZE; pos >= 0; pos--) { sig = g_strndup (patt + pos, SIGNATURE_SIZE); if (!g_regex_match_simple ("[\\*]", sig, G_REGEX_UNGREEDY, G_REGEX_MATCH_NOTEMPTY) && !g_hash_table_lookup (tester->priv->keys, sig)) { LOG ("sig: %s %s", sig, patt); g_hash_table_insert (tester->priv->keys, g_strdup (sig), g_regex_ref (regex)); g_hash_table_insert (tester->priv->optslist, g_strdup (sig), g_strdup (opts)); signature_count++; } else { if (g_regex_match_simple ("^\\*", sig, G_REGEX_UNGREEDY, G_REGEX_MATCH_NOTEMPTY) && !g_hash_table_lookup (tester->priv->pattern, patt)) { LOG ("patt2: %s %s", sig, patt); g_hash_table_insert (tester->priv->pattern, g_strdup (patt), g_regex_ref (regex)); g_hash_table_insert (tester->priv->optslist, g_strdup (patt), g_strdup (opts)); } } g_free (sig); } g_regex_unref (regex); if (signature_count > 1 && g_hash_table_lookup (tester->priv->pattern, patt)) { g_hash_table_steal (tester->priv->pattern, patt); return TRUE; } return FALSE; } else { LOG ("patt: %s%s", patt, ""); /* Pattern is a regexp chars */ g_hash_table_insert (tester->priv->pattern, g_strdup (patt), regex); g_hash_table_insert (tester->priv->optslist, g_strdup (patt), g_strdup (opts)); return FALSE; } } static char* uri_tester_add_url_pattern (UriTester *tester, char *prefix, char *type, char *line) { char **data; char *patt; GString *format_patt; char *opts; gboolean should_free; data = g_strsplit (line, "$", -1); if (!data || !data[0]) { g_strfreev (data); return NULL; } if (data[1] && data[2]) { patt = g_strconcat (data[0], data[1], NULL); opts = g_strconcat (type, ",", data[2], NULL); } else if (data[1]) { patt = data[0]; opts = g_strconcat (type, ",", data[1], NULL); } else { patt = data[0]; opts = type; } if (g_regex_match_simple ("subdocument", opts, G_REGEX_CASELESS, G_REGEX_MATCH_NOTEMPTY)) { if (data[1] && data[2]) g_free (patt); if (data[1]) g_free (opts); g_strfreev (data); return NULL; } format_patt = uri_tester_fixup_regexp (prefix, patt); LOG ("got: %s opts %s", format_patt->str, opts); should_free = uri_tester_compile_regexp (tester, format_patt, opts); if (data[1] && data[2]) g_free (patt); if (data[1]) g_free (opts); g_strfreev (data); return g_string_free (format_patt, should_free); } static inline void uri_tester_frame_add (UriTester *tester, char *line) { const char *separator = " , "; (void)*line++; (void)*line++; if (strchr (line, '\'') || (strchr (line, ':') && !g_regex_match_simple (".*\\[.*:.*\\].*", line, G_REGEX_CASELESS, G_REGEX_MATCH_NOTEMPTY))) { return; } g_string_append (tester->priv->blockcss, separator); g_string_append (tester->priv->blockcss, line); } static inline void uri_tester_frame_add_private (UriTester *tester, const char *line, const char *sep) { char **data; data = g_strsplit (line, sep, 2); if (!(data[1] && *data[1]) || strchr (data[1], '\'') || (strchr (data[1], ':') && !g_regex_match_simple (".*\\[.*:.*\\].*", data[1], G_REGEX_CASELESS, G_REGEX_MATCH_NOTEMPTY))) { g_strfreev (data); return; } if (strchr (data[0], ',')) { char **domains; int i; domains = g_strsplit (data[0], ",", -1); for (i = 0; domains[i]; i++) { g_string_append_printf (tester->priv->blockcssprivate, ";sites['%s']+=',%s'", g_strstrip (domains[i]), data[1]); } g_strfreev (domains); } else { g_string_append_printf (tester->priv->blockcssprivate, ";sites['%s']+=',%s'", data[0], data[1]); } g_strfreev (data); } static char* uri_tester_parse_line (UriTester *tester, char *line) { if (!line) return NULL; g_strchomp (line); /* Ignore comments and new lines */ if (line[0] == '!') return NULL; /* FIXME: No support for whitelisting */ if (line[0] == '@' && line[1] == '@') return NULL; /* FIXME: No support for [include] and [exclude] tags */ if (line[0] == '[') return NULL; /* Skip garbage */ if (line[0] == ' ' || !line[0]) return NULL; /* Got CSS block hider */ if (line[0] == '#' && line[1] == '#' ) { uri_tester_frame_add (tester, line); return NULL; } /* Got CSS block hider. Workaround */ if (line[0] == '#') return NULL; /* Got per domain CSS hider rule */ if (strstr (line, "##")) { uri_tester_frame_add_private (tester, line, "##"); return NULL; } /* Got per domain CSS hider rule. Workaround */ if (strchr (line, '#')) { uri_tester_frame_add_private (tester, line, "#"); return NULL; } /* Got URL blocker rule */ if (line[0] == '|' && line[1] == '|' ) { (void)*line++; (void)*line++; return uri_tester_add_url_pattern (tester, "", "fulluri", line); } if (line[0] == '|') { (void)*line++; return uri_tester_add_url_pattern (tester, "^", "fulluri", line); } return uri_tester_add_url_pattern (tester, "", "uri", line); } static gboolean uri_tester_parse_file_at_uri (UriTester *tester, const char *fileuri) { FILE *file; char line[2000]; char *path = NULL; gboolean result = FALSE; path = g_filename_from_uri (fileuri, NULL, NULL); if ((file = g_fopen (path, "r"))) { while (fgets (line, 2000, file)) g_free (uri_tester_parse_line (tester, line)); fclose (file); result = TRUE; } g_free (path); return result; } static void uri_tester_init (UriTester *tester) { UriTesterPrivate *priv = NULL; LOG ("UriTester initializing %p", tester); priv = URI_TESTER_GET_PRIVATE (tester); tester->priv = priv; priv->filters = NULL; priv->pattern = g_hash_table_new_full (g_str_hash, g_str_equal, (GDestroyNotify)g_free, (GDestroyNotify)g_regex_unref); priv->keys = g_hash_table_new_full (g_str_hash, g_str_equal, (GDestroyNotify)g_free, (GDestroyNotify)g_regex_unref); priv->optslist = g_hash_table_new_full (g_str_hash, g_str_equal, NULL, (GDestroyNotify)g_free); priv->urlcache = g_hash_table_new_full (g_str_hash, g_str_equal, (GDestroyNotify)g_free, (GDestroyNotify)g_free); priv->blockcss = g_string_new ("z-non-exist"); priv->blockcssprivate = g_string_new (""); } static void uri_tester_constructed (GObject *object) { UriTester *tester = URI_TESTER (object); G_OBJECT_CLASS (uri_tester_parent_class)->constructed (object); uri_tester_load_filters (tester); uri_tester_load_patterns (tester); } static void uri_tester_set_property (GObject *object, guint prop_id, const GValue *value, GParamSpec *pspec) { UriTester *tester = URI_TESTER (object); switch (prop_id) { case PROP_FILTERS: uri_tester_set_filters (tester, (GSList*) g_value_get_pointer (value)); break; case PROP_BASE_DATA_DIR: tester->priv->data_dir = uri_tester_ensure_data_dir (g_value_get_string (value)); break; default: G_OBJECT_WARN_INVALID_PROPERTY_ID (object, prop_id, pspec); break; } } static void uri_tester_finalize (GObject *object) { UriTesterPrivate *priv = URI_TESTER_GET_PRIVATE (URI_TESTER (object)); LOG ("UriTester finalizing %p", object); g_slist_foreach (priv->filters, (GFunc) g_free, NULL); g_slist_free (priv->filters); g_free (priv->data_dir); g_hash_table_destroy (priv->pattern); g_hash_table_destroy (priv->keys); g_hash_table_destroy (priv->optslist); g_hash_table_destroy (priv->urlcache); g_string_free (priv->blockcss, TRUE); g_string_free (priv->blockcssprivate, TRUE); G_OBJECT_CLASS (uri_tester_parent_class)->finalize (object); } static void uri_tester_class_init (UriTesterClass *klass) { GObjectClass *object_class = G_OBJECT_CLASS (klass); object_class->set_property = uri_tester_set_property; object_class->constructed = uri_tester_constructed; object_class->finalize = uri_tester_finalize; g_object_class_install_property (object_class, PROP_FILTERS, g_param_spec_pointer ("filters", "filters", "filters", G_PARAM_WRITABLE)); g_object_class_install_property (object_class, PROP_BASE_DATA_DIR, g_param_spec_string ("base-data-dir", "Base data dir", "The base dir where to create the adblock data dir", NULL, G_PARAM_WRITABLE | G_PARAM_CONSTRUCT_ONLY)); g_type_class_add_private (object_class, sizeof (UriTesterPrivate)); } UriTester * uri_tester_new (const char *base_data_dir) { g_return_val_if_fail (base_data_dir != NULL, NULL); return g_object_new (TYPE_URI_TESTER, "base-data-dir", base_data_dir, NULL); } gboolean uri_tester_test_uri (UriTester *tester, const char *req_uri, const char *page_uri, AdUriCheckType type) { /* Don't block top level documents. */ if (type == AD_URI_CHECK_TYPE_DOCUMENT) return FALSE; return uri_tester_is_matched (tester, NULL, req_uri, page_uri); } void uri_tester_set_filters (UriTester *tester, GSList *filters) { UriTesterPrivate *priv = tester->priv; if (priv->filters) { g_slist_foreach (priv->filters, (GFunc) g_free, NULL); g_slist_free (priv->filters); } /* Update private variable and save to disk. */ priv->filters = filters; uri_tester_save_filters (tester); } GSList * uri_tester_get_filters (UriTester *tester) { return tester->priv->filters; } void uri_tester_reload (UriTester *tester) { GDir *g_data_dir = NULL; /* Remove data files in the data dir first. */ g_data_dir = g_dir_open (tester->priv->data_dir, 0, NULL); if (g_data_dir) { const char *filename = NULL; char *filepath = NULL; while ((filename = g_dir_read_name (g_data_dir))) { /* Omit the list of filters. */ if (!g_strcmp0 (filename, FILTERS_LIST_FILENAME)) continue; filepath = g_build_filename (tester->priv->data_dir, filename, NULL); g_unlink (filepath); g_free (filepath); } g_dir_close (g_data_dir); } /* Load patterns from current filters. */ uri_tester_load_patterns (tester); }