Mailing List Archive

[MediaWiki-commits] [Gerrit] analytics/refinery[master]: Add ISP data to webrequest table
Joal has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/405899 )

Change subject: Add ISP data to webrequest table
......................................................................

Add ISP data to webrequest table

Update script for table creation and modify oozie loading job.

Bug: T167907
Change-Id: Id2f73e702b020459e2348a31e429df0ca0b30a2d
---
M hive/webrequest/create_webrequest_table.hql
M oozie/webrequest/load/bundle.properties
M oozie/webrequest/load/refine_webrequest.hql
3 files changed, 7 insertions(+), 4 deletions(-)


git pull ssh://gerrit.wikimedia.org:29418/analytics/refinery refs/changes/99/405899/1

diff --git a/hive/webrequest/create_webrequest_table.hql b/hive/webrequest/create_webrequest_table.hql
index 9837680..749d1a6 100644
--- a/hive/webrequest/create_webrequest_table.hql
+++ b/hive/webrequest/create_webrequest_table.hql
@@ -55,7 +55,8 @@
`pageview_info` map<string, string> COMMENT 'map containing project, language_variant and page_title values only when is_pageview = TRUE.',
`page_id` bigint COMMENT 'MediaWiki page_id for this page title. For redirects this could be the page_id of the redirect or the page_id of the target. This may not always be set, even if the page is actually a pageview.',
`namespace_id` int COMMENT 'MediaWiki namespace_id for this page title. This may not always be set, even if the page is actually a pageview.',
- `tags` array<string> COMMENT 'List containing tags qualifying the request, ex: [portal, wikidata]. Will be used to split webrequest into smaller subsets.'
+ `tags` array<string> COMMENT 'List containing tags qualifying the request, ex: [portal, wikidata]. Will be used to split webrequest into smaller subsets.',
+ `isp_data` map<string, string> COMMENT 'Internet Service Provider data in a map with keys isp, organization, autonomous_system_organization and autonomous_system_number'

)
PARTITIONED BY (
diff --git a/oozie/webrequest/load/bundle.properties b/oozie/webrequest/load/bundle.properties
index ef592c8..58760ca 100644
--- a/oozie/webrequest/load/bundle.properties
+++ b/oozie/webrequest/load/bundle.properties
@@ -57,10 +57,10 @@
webrequest_table = wmf.webrequest

# Version of Hive UDF jar to import
-refinery_jar_version = 0.0.53
+refinery_jar_version = 0.0.58

# Record version to keep track of changes
-record_version = 0.0.19
+record_version = 0.0.20

# Hive table name.
statistics_table = wmf_raw.webrequest_sequence_stats
diff --git a/oozie/webrequest/load/refine_webrequest.hql b/oozie/webrequest/load/refine_webrequest.hql
index 591f83c..a6e7676 100644
--- a/oozie/webrequest/load/refine_webrequest.hql
+++ b/oozie/webrequest/load/refine_webrequest.hql
@@ -57,6 +57,7 @@
CREATE TEMPORARY FUNCTION get_pageview_info AS 'org.wikimedia.analytics.refinery.hive.GetPageviewInfoUDF';
CREATE TEMPORARY FUNCTION normalize_host AS 'org.wikimedia.analytics.refinery.hive.HostNormalizerUDF';
CREATE TEMPORARY FUNCTION get_tags AS 'org.wikimedia.analytics.refinery.hive.GetWebrequestTagsUDF';
+CREATE TEMPORARY FUNCTION isp_data as 'org.wikimedia.analytics.refinery.hive.GetISPDataUDF';


INSERT OVERWRITE TABLE ${destination_table}
@@ -113,7 +114,8 @@
WHEN '-' THEN NULL
ELSE str_to_map(x_analytics, '\;', '=')['ns']
END as namespace_id,
- get_tags(uri_host, uri_path, uri_query, http_status, content_type, user_agent, x_analytics) as tags
+ get_tags(uri_host, uri_path, uri_query, http_status, content_type, user_agent, x_analytics) as tags,
+ isp_data(ip) as isp_data
FROM
${source_table}
WHERE

--
To view, visit https://gerrit.wikimedia.org/r/405899
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Id2f73e702b020459e2348a31e429df0ca0b30a2d
Gerrit-PatchSet: 1
Gerrit-Project: analytics/refinery
Gerrit-Branch: master
Gerrit-Owner: Joal <joal@wikimedia.org>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits