/**
 * Utilities for inferring experiment metadata from a dataset.
 */
import { DatasetId, MetadataColumnValue } from "../types";
import { getUniqueValuesByColumnDB } from "../util/dataset-util";
import { defaultComparator } from "../util/sorting";
import {
  DB,
  columnMatchesValueClause,
  getTableColumns,
  queryDBAsRecords,
  sanitizedColumn,
  sql,
} from "../util/sql";
import { ExperimentMetadata, Kind } from "./types";

// TODO(benkomalo): remove things related to filename type. This is a historical concept
// where we had non-standard filenames. In the SaaS world, everything is standardized.
/**
 * Expected to be kept in sync with:
 *   spr_platform/spr_platform/immunofluorescence/filename_type.py
 */
const CUT_OVER_DATE: { [K: string]: string } = {
  // Starting from assayworks-pbmcs-20210818, all experiments (barring the three
  // "exceptions" defined below) use "standard" filenames.
  assayworks: "20210818",
  spring_dev: "20210913",
  // Starting from spring-pbmcs-20210820, all experiments use "standard"
  // filenames.
  spring: "20210820",
  // As of 2021-12-11, none of these providers have used "standard" filenames,
  // but any future experiments would do so. Thus, we just pick today as the
  // cut-over date.
  amri: "20211211",
  broad: "20211211",
  celentyx: "20211211",
  phenovista: "20211211",
};

const EXCEPTIONS: { [K: string]: string[] } = {
  // assayworks-pbmcs-20210818 used "standard" filenames, but we then reverted
  // back to "assayworks" filenames until assayworks-pbmcs-20211021 for unknown
  // reasons.
  assayworks: ["20210823", "20210830", "20210906"],

  // This is a fairly non-standard Dataset where the "date" is not really a date, and
  // it's not comparable as a "cutover date", so we make an explicit exception.
  broad: ["BBBC021", "BBBC021v1"],
};

/**
 * Look up `key` among the table's own entries only.
 *
 * A plain `table[key]` also resolves inherited members such as `constructor`
 * or `toString`, which would make an unknown provider with such a name look
 * like a configured one.
 */
function lookupOwn<T>(table: { [K: string]: T }, key: string): T | undefined {
  return Object.prototype.hasOwnProperty.call(table, key)
    ? table[key]
    : undefined;
}

/**
 * Infer the filename kind for a given experiment.
 *
 * Resolution order:
 *   1. If the experiment's date is listed in `EXCEPTIONS` for its provider, the
 *      provider-specific kind is returned.
 *   2. If the provider has no (non-empty) entry in `CUT_OVER_DATE`, the kind is
 *      "standard".
 *   3. Otherwise the date is compared (as a string) against the provider's
 *      cut-over date: on or after it is "standard", before it is the
 *      provider-specific kind.
 *
 * @param metadata The experiment metadata, e.g. from `toExperimentMetadata`.
 * @returns "standard" or the experiment's `orderProvider` treated as a `Kind`.
 */
export function toKind(metadata: ExperimentMetadata): Kind {
  const kind = metadata.orderProvider as Kind;

  if (lookupOwn(EXCEPTIONS, kind)?.includes(metadata.date)) {
    return kind;
  }

  const cutOverDate = lookupOwn(CUT_OVER_DATE, kind);
  if (!cutOverDate || metadata.date >= cutOverDate) {
    return "standard";
  }
  return kind;
}

/**
 * Split a dataset name of the form `<orderProvider>-<cellType>-<date>` into its
 * experiment metadata fields.
 *
 * Only the first three dash-separated segments are used; any further segments
 * are ignored, and missing segments are left undefined.
 */
export function toExperimentMetadata(dataset: DatasetId): ExperimentMetadata {
  const segments = dataset.split("-");
  return {
    orderProvider: segments[0],
    cellType: segments[1],
    date: segments[2],
  };
}

/**
 * Information about a dataset which defines how comparisons should be performed on it.
 *
 * This essentially maps dataset-specific metadata columns to standardized concepts
 * which are used in analysis so that analytical modules can operate on a diversity
 * of datasets.
 */
export type ComparisonsConfig = {
  /**
   * The column in the dataset which defines what an individual "treatment" is.
   *
   * All unique values of this column should represent individual records in an
   * analysis. Aggregation will be done such that if there are replicates for
   * any single value of this column, they will get aggregated into a single record.
   */
  treatmentColumn: string;

  /**
   * The value of `treatmentColumn` which represents the negative control.
   *
   * This specifies the reference value against which all comparisons are made.
   */
  controlValue: MetadataColumnValue;

  /**
   * Columns by which comparisons should be stratified/segmented.
   *
   * This is typically something like ["plate"] or ["plate", "condition"]. The
   * unique values of these columns represent segments within which comparisons
   * will be done (so that raw values will not be compared across stratification
   * columns).
   */
  stratifyColumns: string[];
};

/**
 * Column names that `isIndexColumn` reports as index columns.
 *
 * `getFilteredColumns` excludes these when collecting candidate metadata columns.
 */
// TODO(you): Fix this no-unused-exports rule violation
// ts-unused-exports:disable-next-line
export const INDEX_COLUMNS = [
  "dataset",
  "plate",
  "well",
  "field",
  "row",
  "column",
];
/**
 * Columns that `getFilteredColumns` always excludes from candidate metadata columns.
 */
// TODO(you): Fix this no-unused-exports rule violation
// ts-unused-exports:disable-next-line
export const SPURIOUS_COLUMNS = ["z_layer"]; // This is a weird artifact of ingestion that we probably should just strip
/**
 * Palette-identifying columns; `getFilteredColumns` excludes these from candidate
 * metadata columns as well.
 */
// TODO(you): Fix this no-unused-exports rule violation
// ts-unused-exports:disable-next-line
export const PALETTE_IDENTIFIER_COLUMNS = ["palette_number"];

/**
 * Report whether `column` is one of the names listed in `INDEX_COLUMNS`.
 *
 * The match is exact (case-sensitive).
 */
export function isIndexColumn(column: string) {
  return INDEX_COLUMNS.some((indexColumn) => indexColumn === column);
}

/**
 * List the `sample_metadata` columns that are candidates for analysis.
 *
 * A column qualifies when it is VARCHAR-typed, its name (lower-cased) consists
 * only of letters, digits, underscores, spaces and dashes, and it is not an
 * index, spurious, or palette-identifier column.
 */
async function getFilteredColumns(db: DB) {
  // Currently we only want to operate on columns that are string typed
  const varcharColumns = await getTableColumns(
    db,
    "sample_metadata",
    (row) => row["column_type"] === "VARCHAR",
  );

  return varcharColumns.filter((column) => {
    if (!/^[a-z0-9_ -]+$/.test(column.toLowerCase())) {
      return false;
    }
    if (isIndexColumn(column)) {
      return false;
    }
    if (SPURIOUS_COLUMNS.includes(column)) {
      return false;
    }
    return !PALETTE_IDENTIFIER_COLUMNS.includes(column);
  });
}

/**
 * Find metadata columns that could be used to group (segment) comparisons.
 *
 * For every candidate column (see `getFilteredColumns`) this counts rows per
 * distinct value, both over the whole `sample_metadata` table and restricted to
 * rows where `config.treatmentColumn` matches `config.controlValue`.
 *
 * @param sampleMetadataDB The database holding the `sample_metadata` table.
 * @param config Supplies the treatment column and control value used to find
 *   control rows.
 * @param mustMatchControlCounts When true (the default), a column is only
 *   returned if every one of its distinct values has at least one control row.
 * @returns Columns with more than one distinct value, ordered by how many of
 *   their values have control rows (most first). Each entry's `values` are the
 *   values that have control rows, ordered by control row count (most first).
 */
export async function getPotentialGroupByColumns(
  sampleMetadataDB: DB,
  config: ComparisonsConfig,
  mustMatchControlCounts: boolean = true,
): Promise<{ column: string; values: string[] }[]> {
  const columns = await getFilteredColumns(sampleMetadataDB);

  const stats = await Promise.all(
    columns.map(async (column) => {
      // Use the same sanitized identifier in both queries rather than
      // interpolating the raw column name between quotes.
      const quotedColumn = sanitizedColumn(column);

      const values = await queryDBAsRecords<{ value: string; count: bigint }>(
        sampleMetadataDB,
        sql`SELECT ${quotedColumn} as value, COUNT(*) AS count
            FROM sample_metadata
            GROUP BY ${quotedColumn}`,
      );
      const controlCounts = await queryDBAsRecords<{
        value: string;
        count: bigint;
      }>(
        sampleMetadataDB,
        sql`SELECT ${quotedColumn} as value, COUNT(*) AS count
            FROM sample_metadata
            WHERE ${columnMatchesValueClause(
              config.treatmentColumn,
              config.controlValue,
            )}
            GROUP BY ${quotedColumn}`,
      );

      return { column, values, controlCounts };
    }),
  );

  return stats
    .filter(
      (stat) =>
        // Only makes sense as a grouping if there's more than one value
        // and will only work if every value for the grouping has an entry for the control
        stat.values.length > 1 &&
        (!mustMatchControlCounts ||
          stat.values.length === stat.controlCounts.length),
    )
    .sort((a, b) => b.controlCounts.length - a.controlCounts.length)
    .map((stat) => ({
      column: stat.column,
      // Sort a copy so the query results are not reordered in place.
      values: [...stat.controlCounts]
        .sort((a, b) => Number(b.count) - Number(a.count))
        .map((controlCount) => controlCount.value),
    }));
}

/**
 * List the metadata columns that could serve as the treatment column, sorted
 * with `defaultComparator`.
 *
 * Currently a column is only eligible if:
 * - It has string values
 * - It only has one value for any given plate+well
 * - It has more than one unique value across the dataset
 */
export async function getPotentialTreatmentColumns(sampleMetadataDB: DB) {
  const candidates = await getFilteredColumns(sampleMetadataDB);
  if (candidates.length === 0) {
    return [];
  }

  // One `COUNT(DISTINCT col) as col` term per candidate, shared by both queries.
  const distinctCountTerms = candidates
    .map(
      (column) =>
        `COUNT(DISTINCT ${sanitizedColumn(column)}) as ${sanitizedColumn(
          column,
        )}`,
    )
    .join(", ");

  const perPlateWell = await queryDBAsRecords<{
    [column: string]: bigint;
  }>(
    sampleMetadataDB,
    sql`SELECT ${distinctCountTerms} FROM sample_metadata GROUP BY plate, well`,
  );

  const overallRows = await queryDBAsRecords<{
    [column: string]: bigint;
  }>(sampleMetadataDB, sql`SELECT ${distinctCountTerms} FROM sample_metadata`);
  const overall = overallRows[0];

  const eligible = candidates.filter((column) => {
    const singleValuedPerWell = perPlateWell.every(
      (counts) => !(counts[column] > 1),
    );
    return singleValuedPerWell && overall[column] > 1;
  });
  return eligible.sort(defaultComparator);
}

/**
 * List the distinct values of `treatmentColumn` in `sample_metadata`, most
 * frequent first (ties broken by ascending value).
 *
 * The ordering matches what `inferLikelyNegativeControl` expects as input.
 *
 * @param sampleMetadataDB The database holding the `sample_metadata` table.
 * @param treatmentColumn The column whose values are listed.
 * @returns The distinct values, or `null` entries if `treatmentColumn` is empty.
 */
export async function getValidControlValues(
  sampleMetadataDB: DB,
  treatmentColumn: string,
): Promise<MetadataColumnValue[]> {
  // Sanitize the identifier like the other queries in this module do, instead
  // of interpolating the raw column name between quotes.
  const quotedColumn = sanitizedColumn(treatmentColumn);
  const rows = await queryDBAsRecords<{ value: MetadataColumnValue }>(
    sampleMetadataDB,
    sql`SELECT ${quotedColumn} as value, COUNT(1) as n FROM sample_metadata GROUP BY ${quotedColumn} ORDER BY n DESC, ${quotedColumn} ASC`,
  );
  return rows.map((row) => (treatmentColumn ? row.value : null));
}

/**
 * Infer a {@code ComparisonsConfig} from dataset metadata.
 *
 * The treatment column is the first well-known name (matched case-insensitively)
 * found among the potential treatment columns; failing that, the potential
 * treatment column with the most unique values. The control value is then
 * inferred from that column's values, and comparisons are stratified by plate.
 *
 * Throws if no treatment column can be inferred.
 */
export async function getComparisonsConfig(
  sampleMetadataDB: DB,
): Promise<ComparisonsConfig> {
  const candidates = await getPotentialTreatmentColumns(sampleMetadataDB);

  // Lower-cased name -> actual column name. If two columns differ only by case,
  // the later one wins.
  const byLowerCaseName = new Map<string, string>();
  for (const candidate of candidates) {
    byLowerCaseName.set(candidate.toLowerCase(), candidate);
  }

  // TODO(benkomalo): this totally ignores concentration or timepoint; if either are
  // present then we really need to create a synthetic column that represents the
  // joined set of treatment + concentration (or we need to change the ComparisonConfig
  // type to have treatmentColumns as an array.
  const wellKnownNames = [
    "treatment_id",
    "treatment_name",
    "treatment",
    "compound_name",
    "compound_id",
    "compound",
    "id",
  ];
  const wellKnownMatch: string | undefined = wellKnownNames.find((name) =>
    byLowerCaseName.has(name),
  );

  let treatmentColumn: string | undefined;
  if (wellKnownMatch !== undefined) {
    treatmentColumn = byLowerCaseName.get(wellKnownMatch);
  } else {
    // Just guess that they'd want to analyze by looking at the column with the most
    // values.
    const uniqueValues = await getUniqueValuesByColumnDB(sampleMetadataDB);
    const uniqueCountOf = (column: string) =>
      // TODO(you): Fix this no-unnecessary-condition rule violation
      // eslint-disable-next-line @typescript-eslint/no-unnecessary-condition
      uniqueValues[column]?.size ?? 0;
    candidates.sort((a, b) => uniqueCountOf(b) - uniqueCountOf(a));
    treatmentColumn = candidates.at(0);
  }

  if (!treatmentColumn) {
    throw new Error("Unable to infer treatment column");
  }

  const validControlValues = await getValidControlValues(
    sampleMetadataDB,
    treatmentColumn,
  );
  const controlValue = inferLikelyNegativeControl(validControlValues);

  // TODO(benkomalo): this totally ignores donor, condition, and other dimensions
  // for which we might want to stratify. It may be hard to generalize those.
  return { treatmentColumn, controlValue, stratifyColumns: ["plate"] };
}

/**
 * Whole-word, case-insensitive patterns for terms that commonly name a negative
 * control.
 */
const NEGATIVE_CONTROL_PATTERNS = [
  "negative control",
  "control",
  "dmso",
  "pbs",
  "untreated",
  "untreated control",
  "vehicle",
].map((term) => new RegExp(`\\b${term}\\b`, "i"));

/**
 * Given a set of values all belonging to the same column, infer the negative control.
 *
 * It's expected that values are pre-sorted in terms of highest frequency/replicate
 * counts first.
 *
 * - If every value is a string (or null), the first value containing a
 *   well-known control term (e.g. "DMSO", "vehicle") is returned.
 * - If every value is a number (or null), a value equal to zero is returned if
 *   present.
 * - Otherwise, or if nothing matched, the first value is returned.
 *
 * @param values The candidate values, most frequent first.
 * @returns The inferred control value (undefined at runtime if `values` is empty).
 */
export function inferLikelyNegativeControl(
  values: MetadataColumnValue[],
): MetadataColumnValue {
  const allOfOptionalType = (typename: string) =>
    values.every((v) => v === null || typeof v === typename);
  const namesControl = (text: string) =>
    NEGATIVE_CONTROL_PATTERNS.some((pattern) => pattern.test(text));

  if (allOfOptionalType("string")) {
    const namedControl = values.find(
      (v) => typeof v === "string" && namesControl(v),
    );
    if (namedControl !== undefined) {
      return namedControl;
    }
  } else if (allOfOptionalType("number")) {
    // Compare against undefined: the match itself is 0, which is falsy, so a
    // truthiness check would never select it.
    const zero = values.find((v) => v === 0);
    if (zero !== undefined) {
      return zero;
    }
  }

  return values[0];
}
