首页 文章

BigQuery join和UDF

提问于
浏览
1

如何在select语句中连接两个表,我也使用UDF?我将SQL查询和UDF函数存储在两个我通过bq命令行调用的文件中 . 但是,当我运行它时,我收到以下错误:

查询操作中的BigQuery错误:处理作业'[projectID]时出错:bqjob_ [error_number]':无法解析表名:缺少数据集名称 .

请注意,我通过gcloud auth方法登录了正确的项目 . 我的SQL语句:

SELECT
  substr(date,1,6) as date,
  device,
  channelGroup,
  COUNT(DISTINCT CONCAT(fullVisitorId,cast(visitId as string))) AS sessions,
  COUNT(DISTINCT fullVisitorId) AS users,
FROM
  defaultChannelGroup(
    SELECT
      a.date,
      a.device.deviceCategory AS device,
      b.hits.page.pagePath AS page,
      a.fullVisitorId,
      a.visitId,
      a.trafficSource.source AS trafficSourceSource,
      a.trafficSource.medium AS trafficSourceMedium,
      a.trafficSource.campaign AS trafficSourceCampaign
    FROM FLATTEN(
      SELECT date,device.deviceCategory,trafficSource.source,trafficSource.medium,trafficSource.campaign,fullVisitorId,visitID
      FROM
        TABLE_DATE_RANGE([datasetname.ga_sessions_],TIMESTAMP('2016-10-01'),TIMESTAMP('2016-10-31'))
    ,hits) as a
    LEFT JOIN FLATTEN(
      SELECT hits.page.pagePath,hits.time,visitID,fullVisitorId
      FROM
        TABLE_DATE_RANGE([datasetname.ga_sessions_],TIMESTAMP('2016-10-01'),TIMESTAMP('2016-10-31'))
      WHERE
        hits.time = 0
        and trafficSource.medium = 'organic'
    ,hits) as b
    ON a.fullVisitorId = b.fullVisitorId AND a.visitID = b.visitID
  )
GROUP BY
  date,
  device,
  channelGroup
ORDER BY sessions DESC

我用我正确的名称替换了我的datasetname;和一些UDF(与另一个查询一起使用):

function defaultChannelGroup(row, emit)
{
  function output(channelGroup) {
    emit({channelGroup:channelGroup,
      fullVisitorId: row.fullVisitorId, 
      visitId: row.visitId,
      device: row.device,
      date: row.date
      });
  }
  computeDefaultChannelGroup(row, output);
}

bigquery.defineFunction(
  'defaultChannelGroup',
  ['date', 'device', 'page', 'trafficSourceMedium', 'trafficSourceSource', 'trafficSourceCampaign', 'fullVisitorId', 'visitId'],
  //['device', 'page', 'trafficSourceMedium', 'trafficSourceSource', 'trafficSourceCampaign', 'fullVisitorId', 'visitId'],
  [{'name': 'channelGroup', 'type': 'string'},
  {'name': 'fullVisitorId', 'type': 'string'},
  {'name': 'visitId', 'type': 'integer'},
  {'name': 'device', 'type': 'string'},
  {'name': 'date', 'type': 'string'}
],
  defaultChannelGroup
);

1 回答

  • 1

    FLATTEN函数中的select语句需要放在括号中 .

    在shell中运行bq命令: bq query --udf_resource=udf.js "$(cat query.sql)"

    query.sql包含以下脚本:

    SELECT
      substr(date,1,6) as date,
      device,
      channelGroup,
      COUNT(DISTINCT CONCAT(fullVisitorId,cast(visitId as string))) AS sessions,
      COUNT(DISTINCT fullVisitorId) AS users,
      COUNT(DISTINCT transactionId) as orders,
      CAST(SUM(transactionRevenue)/1000000 AS INTEGER) as sales
    FROM
      defaultChannelGroup(
        SELECT
          a.date as date,
          a.device.deviceCategory AS device,
          b.hits.page.pagePath AS page,
          a.fullVisitorId as fullVisitorId,
          a.visitId as visitId,
          a.trafficSource.source AS trafficSourceSource,
          a.trafficSource.medium AS trafficSourceMedium,
          a.trafficSource.campaign AS trafficSourceCampaign,
          a.hits.transaction.transactionRevenue as transactionRevenue,
          a.hits.transaction.transactionID as transactionId
        FROM FLATTEN((
          SELECT  date,device.deviceCategory,trafficSource.source,trafficSource.medium,trafficSource.campaign,fullVisitorId,visitID,
                  hits.transaction.transactionID, hits.transaction.transactionRevenue
          FROM
            TABLE_DATE_RANGE([datasetname.ga_sessions_],TIMESTAMP('2016-10-01'),TIMESTAMP('2016-10-31'))
        ),hits) as a
        LEFT JOIN FLATTEN((
          SELECT hits.page.pagePath,hits.time,trafficSource.medium,visitID,fullVisitorId
          FROM
            TABLE_DATE_RANGE([datasetname.ga_sessions_],TIMESTAMP('2016-10-01'),TIMESTAMP('2016-10-31'))
          WHERE
            hits.time = 0
            and trafficSource.medium = 'organic'
        ),hits) as b
        ON a.fullVisitorId = b.fullVisitorId AND a.visitID = b.visitID
      )
    GROUP BY
      date,
      device,
      channelGroup
    ORDER BY sessions DESC
    

    和udf.js包含以下函数(不包括'computeDefaultChannelGroup'函数):

    function defaultChannelGroup(row, emit)
    {
      function output(channelGroup) {
        emit({channelGroup:channelGroup,
          date: row.date,
          fullVisitorId: row.fullVisitorId, 
          visitId: row.visitId,
          device: row.device,
          transactionId: row.transactionId,
          transactionRevenue: row.transactionRevenue,
          });
      }
      computeDefaultChannelGroup(row, output);
    }
    
    bigquery.defineFunction(
      'defaultChannelGroup',
      ['date', 'device', 'page', 'trafficSourceMedium', 'trafficSourceSource', 'trafficSourceCampaign', 'fullVisitorId', 'visitId', 'transactionId', 'transactionRevenue'],
      [{'name': 'channelGroup', 'type': 'string'},
      {'name': 'date', 'type': 'string'},
      {'name': 'fullVisitorId', 'type': 'string'},
      {'name': 'visitId', 'type': 'integer'},
      {'name': 'device', 'type': 'string'},
      {'name': 'transactionId', 'type': 'string'},
      {'name': 'transactionRevenue', 'type': 'integer'}
    ],
      defaultChannelGroup
    );
    

    无误地运行并匹配Google Analytics中的数据 .

相关问题