/**
 * @license
 * Copyright 2022 JsData. All rights reserved.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * ==========================================================================
 */
import DataFrame from "../core/frame";
import { ArrayType1D, ArrayType2D } from "../shared/types";
import Series from "../core/series";

/**
 * Performs all group-by operations on a DataFrame, including every
 * aggregate function exposed below.
 */
export default class Groupby {
    /** Internal per-group data, keyed by group key (see `group()`). */
    colDict: {
        [key: string]: {};
    };
    /** Names of the columns the data is grouped by. */
    keyCol: ArrayType1D;
    /** The DataFrame data being grouped. */
    data?: ArrayType2D | null;
    /** All column names in the DataFrame. */
    columnName: ArrayType1D;
    /** Dtypes of the DataFrame columns. */
    colDtype: ArrayType1D;
    /** NOTE(review): presumably the positions of the key columns - confirm against the implementation. */
    colIndex: ArrayType1D;
    groupDict?: any;
    /** NOTE(review): presumably the column names selected via `col()` - confirm against the implementation. */
    groupColNames?: Array<string>;
    /** Maps each group key to the key-column values it was built from (see `group()`). */
    keyToValue: {
        [key: string]: ArrayType1D;
    };

    /**
     * @param keyCol Array containing the names of the columns to group by
     * @param data The DataFrame data
     * @param columnName Array of all column names in the DataFrame
     * @param colDtype Array of the column dtypes
     * @param colIndex NOTE(review): presumably the indices of the key columns - confirm
     */
    constructor(keyCol: ArrayType1D, data: ArrayType2D | null, columnName: ArrayType1D, colDtype: ArrayType1D, colIndex: ArrayType1D);

    /**
     * Generate the group object data needed for group operations.
     *
     * let data = [ [ 1, 2, 3 ], [ 4, 5, 6 ], [ 20, 30, 40 ], [ 39, 89, 78 ] ];
     * let cols = [ "A", "B", "C" ];
     * let df = new dfd.DataFrame(data, { columns: cols });
     * let groupDf = df.groupby([ "A" ]);
     *
     * The following internal object is generated and saved to this.colDict:
     * {
     *   '1': { A: [ 1 ], B: [ 2 ], C: [ 3 ] },
     *   '4': { A: [ 4 ], B: [ 5 ], C: [ 6 ] },
     *   '20': { A: [ 20 ], B: [ 30 ], C: [ 40 ] },
     *   '39': { A: [ 39 ], B: [ 89 ], C: [ 78 ] }
     * }
     *
     * When grouping by more than one column, the key values are joined with '-',
     * e.g. for df.groupby(['A','B']) the result looks like this:
     * {
     *   '1-2': { A: [ 1 ], B: [ 2 ], C: [ 3 ] },
     *   '4-5': { A: [ 4 ], B: [ 5 ], C: [ 6 ] }
     * }
     *
     * Analysing a specific column, e.g. df.groupby(['A','B']).col(['C']),
     * produces the following internal result:
     * {
     *   '1-2': { C: [ 3 ] },
     *   '4-5': { C: [ 6 ] }
     * }
     *
     * When building the multi-index style DataFrame from this data we have lost
     * track of the values of columns A and B. They could be recovered by calling
     * split('-') on the object keys, e.g. '1-2'.split('-') gives the values for
     * A and B. However, the values of A and B may themselves contain '-', e.g.
     * {
     *   '1--2-': { C: [ 3 ] },
     *   '4--5-': { C: [ 6 ] }
     * }
     * in which case `.split('-')` does not work reliably. Hence a key-value
     * object, `keyToValue`, stores each key together with its associated values.
     *
     * NOTE: A previous implementation represented the group-by data as a graph
     * and used depth-first search (DFS). A plain JavaScript key-value object is
     * used as a hash map instead, to reduce search time compared to graph + DFS.
     */
    group(): Groupby;

    /**
     * Generate new internal group-by data restricted to the given columns.
     *
     * group = df.groupby(['A', 'B']).col(['C'])
     *
     * This filters the colDict property generated by `.group()` so that each
     * group contains only column `C` in its internal object, e.g. from
     * {
     *   '1-2': { A: [ 1 ], B: [ 2 ], C: [ 3 ] },
     *   '4-5': { A: [ 4 ], B: [ 5 ], C: [ 6 ] }
     * }
     * to
     * {
     *   '1-2': { C: [ 3 ] },
     *   '4-5': { C: [ 6 ] }
     * }
     * @param colNames column names
     * @returns Groupby
     */
    col(colNames: ArrayType1D | undefined): Groupby;

    /**
     * Perform all group-by arithmetic operations.
     *
     * In a previous implementation every group's data was stored as a DataFrame,
     * which involved a lot of memory usage. Each group is now a plain JavaScript
     * object and all arithmetic is done directly on JavaScript arrays.
     *
     * e.g. using this internal data
     * {
     *   '1-2': { A: [ 1, 3 ], B: [ 2, 5 ], C: [ 3, 5 ] },
     *   '4-5': { A: [ 4, 1 ], B: [ 5, 0 ], C: [ 6, 12 ] }
     * }
     * 1) using groupby(['A', 'B']) with operation "mean"
     *    result:
     *    {
     *      '1-2': { A_mean: [ 2 ], B_mean: [ 3.5 ], C_mean: [ 4 ] },
     *      '4-5': { A_mean: [ 2.5 ], B_mean: [ 2.5 ], C_mean: [ 9 ] }
     *    }
     * 2) with operation { A: 'mean', B: 'sum', C: 'min' }
     *    result:
     *    {
     *      '1-2': { A_mean: [ 2 ], B_sum: [ 7 ], C_min: [ 3 ] },
     *      '4-5': { A_mean: [ 2.5 ], B_sum: [ 5 ], C_min: [ 6 ] }
     *    }
     * 3) with operation { A: 'mean', B: 'sum', C: ['min', 'max'] }
     *    result:
     *    {
     *      '1-2': { A_mean: [ 2 ], B_sum: [ 7 ], C_min: [ 3 ], C_max: [ 5 ] },
     *      '4-5': { A_mean: [ 2.5 ], B_sum: [ 5 ], C_min: [ 6 ], C_max: [ 12 ] }
     *    }
     * @param operation
     */
    private arithemetic;

    /**
     * Perform all arithmetic logic.
     * @param colVal
     * @param ops
     */
    private groupMathLog;

    /**
     * Take the internal group-by data and convert it to a single DataFrame.
     * @param colDict
     */
    private toDataFrame;

    private operations;

    /**
     * Obtain the count for each group
     * @returns DataFrame
     */
    count(): DataFrame;

    /**
     * Obtain the sum of columns for each group
     * @returns DataFrame
     */
    sum(): DataFrame;

    /**
     * Obtain the standard deviation of columns for each group
     * @returns DataFrame
     */
    std(): DataFrame;

    /**
     * Obtain the variance of columns for each group
     * @returns DataFrame
     */
    var(): DataFrame;

    /**
     * Obtain the mean of columns for each group
     * @returns DataFrame
     */
    mean(): DataFrame;

    /**
     * Obtain the cumsum of columns for each group
     * @returns DataFrame
     */
    cumSum(): DataFrame;

    /**
     * Obtain the cummax of columns for each group
     * @returns DataFrame
     */
    cumMax(): DataFrame;

    /**
     * Obtain the cumprod of columns for each group
     * @returns DataFrame
     */
    cumProd(): DataFrame;

    /**
     * Obtain the cummin of columns for each group
     * @returns DataFrame
     */
    cumMin(): DataFrame;

    /**
     * Obtain the max value of columns for each group
     * @returns DataFrame
     */
    max(): DataFrame;

    /**
     * Obtain the min of columns for each group
     * @returns DataFrame
     */
    min(): DataFrame;

    /**
     * Obtain a specific group
     * @param keys Array of key values identifying the group
     * @returns DataFrame
     */
    getGroup(keys: Array<string | number>): DataFrame;

    /**
     * Perform aggregation on all groups
     * @param ops Maps a column name to one operation name or a list of them,
     *            e.g. `{ A: 'mean', C: ['min', 'max'] }`
     * @returns DataFrame
     */
    agg(ops: {
        [key: string]: Array<string> | string;
    }): DataFrame;

    /**
     * Apply a custom aggregator function to each group
     * @param callable
     * @returns DataFrame
     * @example
     * let grp = df.groupby(['A'])
     * grp.apply((x) => x.count())
     */
    apply(callable: (x: DataFrame) => DataFrame | Series): DataFrame;

    private concatGroups;

    /**
     * Obtain the total number of groups
     * @returns number
     */
    get ngroups(): number;

    /**
     * Obtain the internal group data
     * @returns {[keys: string]: {}}
     */
    get groups(): {
        [keys: string]: {};
    };

    /**
     * Obtain the first row of each group
     * @returns DataFrame
     */
    first(): DataFrame;

    /**
     * Obtain the last row of each group
     * @returns DataFrame
     */
    last(): DataFrame;

    /**
     * Obtain the size of each group
     * @returns DataFrame
     */
    size(): DataFrame;

    private colKeyDict;
}