# -*- coding: utf-8 -*-
from __future__ import print_function

from collections import defaultdict
from datetime import datetime
from decimal import Decimal

import numpy as np
import pytest

from pandas.compat import (
    OrderedDict, StringIO, lmap, lrange, lzip, map, range, zip)
from pandas.errors import PerformanceWarning

import pandas as pd
from pandas import (
    DataFrame, Index, MultiIndex, Panel, Series, Timestamp, compat, date_range,
    read_csv)
import pandas.core.common as com
import pandas.util.testing as tm
from pandas.util.testing import (
    assert_almost_equal, assert_frame_equal, assert_series_equal)


def test_repr():
    """Check the ``repr`` of a Grouper built with both a key and a level."""
    # GH18203
    grouper = pd.Grouper(key='A', level='B')
    assert repr(grouper) == "Grouper(key='A', level='B', axis=0, sort=False)"


@pytest.mark.parametrize('dtype', ['int64', 'int32', 'float64', 'float32'])
def test_basic(dtype):
    """Smoke-test iteration, agg, apply and transform on a SeriesGroupBy."""
    data = Series(np.arange(9) // 3, index=np.arange(9), dtype=dtype)

    # shuffle the row order so grouping cannot rely on sorted input
    shuffled = np.arange(9)
    np.random.shuffle(shuffled)
    data = data.reindex(shuffled)

    grouped = data.groupby(lambda label: label // 3)

    # every group holds exactly three rows
    for _, chunk in grouped:
        assert len(chunk) == 3

    means = grouped.aggregate(np.mean)
    assert means[1] == 1

    assert_series_equal(means, grouped.agg(np.mean))  # shorthand
    assert_series_equal(means, grouped.mean())
    assert_series_equal(grouped.agg(np.sum), grouped.sum())

    # apply and transform must agree for a same-shaped result
    expected = grouped.apply(lambda x: x * x.sum())
    transformed = grouped.transform(lambda x: x * x.sum())
    assert transformed[7] == 12
    assert_series_equal(transformed, expected)

    # grouping by the values themselves gives the same means
    value_grouped = data.groupby(data)
    assert_series_equal(value_grouped.aggregate(np.mean), means,
                        check_index_type=False)

    # complex agg (only checked for not raising)
    grouped.aggregate([np.mean, np.std])

    with tm.assert_produces_warning(FutureWarning,
                                    check_stacklevel=False):
        grouped.aggregate({'one': np.mean, 'two': np.std})

    # the aggregating function can look up the group key via ``x.name``
    offsets = {0: 10, 1: 20, 2: 30}
    shifted = grouped.agg(lambda x: offsets[x.name] + x.mean())
    assert shifted[1] == 21

    # corner cases
    # exception raised is type Exception
    with pytest.raises(Exception, match="Must produce aggregated value"):
        grouped.aggregate(lambda x: x * 2)


def test_groupby_nonobject_dtype(mframe, df_mixed_floats):
    """Integer-code keys group like object keys; apply keeps mixed dtypes."""
    codes = mframe.index.codes[0]
    result = mframe.groupby(codes).sum()
    expected = mframe.groupby(codes.astype('O')).sum()
    assert_frame_equal(result, expected)

    # GH 3911, mixed frame non-conversion
    frame = df_mixed_floats.copy()
    frame['value'] = lrange(len(frame))

    def max_value(group):
        # row of the group that holds the largest 'value'
        return group.loc[group['value'].idxmax()]

    applied = frame.groupby('A').apply(max_value)
    dtype_counts = applied.get_dtype_counts().sort_values()
    expected_counts = Series({'float64': 2,
                              'int64': 1,
                              'object': 2}).sort_values()
    assert_series_equal(dtype_counts, expected_counts)


def test_groupby_return_type():
    """``apply`` returns a consistent container type across group shapes.

    Covers GH2893 (reduced type with ``squeeze=True``), GH3596 (``count``
    stays a DataFrame) and GH5592 (a group returning ``None`` must not
    change the result type, including datetime columns and scalar outputs).
    """

    def demean_val2(dataf):
        return dataf["val2"] - dataf["val2"].mean()

    # GH2893, return a reduced type
    df1 = DataFrame(
        [{"val1": 1, "val2": 20},
         {"val1": 1, "val2": 19},
         {"val1": 2, "val2": 27},
         {"val1": 2, "val2": 12}
         ])

    result = df1.groupby("val1", squeeze=True).apply(demean_val2)
    assert isinstance(result, Series)

    # same check when every row falls into one single group
    df2 = DataFrame(
        [{"val1": 1, "val2": 20},
         {"val1": 1, "val2": 19},
         {"val1": 1, "val2": 27},
         {"val1": 1, "val2": 12}
         ])

    result = df2.groupby("val1", squeeze=True).apply(demean_val2)
    assert isinstance(result, Series)

    # GH3596, return a consistent type (regression in 0.11 from 0.10.1)
    df = DataFrame([[1, 1], [1, 1]], columns=['X', 'Y'])
    result = df.groupby('X', squeeze=False).count()
    assert isinstance(result, DataFrame)

    # GH5592
    # inconsistent return type
    df = DataFrame(dict(A=['Tiger', 'Tiger', 'Tiger', 'Lamb', 'Lamb',
                           'Pony', 'Pony'], B=Series(
                               np.arange(7), dtype='int64'), C=date_range(
                                   '20130101', periods=7)))

    def first_row(grp):
        return grp.iloc[0]

    def first_row_unless(skipped):
        # build an applier returning None for the group named ``skipped``
        def applier(grp):
            if grp.name == skipped:
                return None
            return grp.iloc[0]
        return applier

    expected = df.groupby('A').first()[['B']]
    result = df.groupby('A').apply(first_row)[['B']]
    assert_frame_equal(result, expected)

    # a None for one group becomes a NaN row, whichever group it is
    for skipped in ['Tiger', 'Pony']:
        result = df.groupby('A').apply(first_row_unless(skipped))[['B']]
        e = expected.copy()
        e.loc[skipped] = np.nan
        assert_frame_equal(result, e)

    # 5592 revisited, with datetimes
    result = df.groupby('A').apply(first_row_unless('Pony'))[['C']]
    e = df.groupby('A').first()[['C']]
    e.loc['Pony'] = pd.NaT
    assert_frame_equal(result, e)

    # scalar outputs
    def first_c_unless_pony(grp):
        if grp.name == 'Pony':
            return None
        return grp.iloc[0].loc['C']

    result = df.groupby('A').apply(first_c_unless_pony)
    e = df.groupby('A').first()['C'].copy()
    e.loc['Pony'] = np.nan
    e.name = None
    assert_series_equal(result, e)


def test_pass_args_kwargs(ts, tsframe):
    """Extra positional/keyword arguments reach the user function."""

    def percentile(x, q=None, axis=0):
        return np.percentile(x, q, axis=axis)

    def percentile_80(x):
        return np.percentile(x, 80, axis=0)

    # Series
    by_month = ts.groupby(lambda stamp: stamp.month)
    agg_result = by_month.agg(np.percentile, 80, axis=0)
    apply_result = by_month.apply(np.percentile, 80, axis=0)
    trans_result = by_month.transform(np.percentile, 80, axis=0)

    agg_expected = by_month.quantile(.8)
    trans_expected = by_month.transform(percentile_80)

    assert_series_equal(apply_result, agg_expected)
    assert_series_equal(agg_result, agg_expected, check_names=False)
    assert_series_equal(trans_result, trans_expected)

    # same again, passing the percentile by keyword
    agg_result = by_month.agg(percentile, q=80)
    apply_result = by_month.apply(percentile, q=80)
    trans_result = by_month.transform(percentile, q=80)
    assert_series_equal(agg_result, agg_expected)
    assert_series_equal(apply_result, agg_expected)
    assert_series_equal(trans_result, trans_expected)

    # DataFrame
    frame_by_month = tsframe.groupby(lambda stamp: stamp.month)
    agg_result = frame_by_month.agg(np.percentile, 80, axis=0)
    apply_result = frame_by_month.apply(DataFrame.quantile, .8)
    expected = frame_by_month.quantile(.8)
    assert_frame_equal(apply_result, expected)
    assert_frame_equal(agg_result, expected, check_names=False)

    agg_result = frame_by_month.agg(percentile, q=80)
    apply_result = frame_by_month.apply(DataFrame.quantile, q=.8)
    assert_frame_equal(agg_result, expected, check_names=False)
    assert_frame_equal(apply_result, expected)


def test_len():
    """``len`` of a groupby object equals its number of groups."""
    frame = tm.makeTimeDataFrame()

    by_day = frame.groupby([lambda x: x.year, lambda x: x.month,
                            lambda x: x.day])
    assert len(by_day) == len(frame)

    by_month = frame.groupby([lambda x: x.year, lambda x: x.month])
    year_months = {(stamp.year, stamp.month) for stamp in frame.index}
    assert len(by_month) == len(year_months)

    # issue 11016
    nan_keyed = pd.DataFrame(dict(a=[np.nan] * 3, b=[1, 2, 3]))
    assert len(nan_keyed.groupby('a')) == 0
    assert len(nan_keyed.groupby('b')) == 3
    assert len(nan_keyed.groupby(['a', 'b'])) == 3


def test_basic_regression():
    """Grouping by a longer, float-valued key Series must not raise."""
    # regression
    values = [1.0 * x for x in lrange(1, 10) * 10][:1095]
    obj = Series(values, lrange(0, len(values)))

    keys = np.random.random((1100, ))
    keys = Series(keys, lrange(0, len(keys))) * 10.

    # only checked for not raising
    obj.groupby(keys).mean()


@pytest.mark.parametrize('dtype', ['float64', 'float32', 'int64',
                                   'int32', 'int16', 'int8'])
def test_with_na_groups(dtype):
    """Rows whose label is NaN are left out of every group."""
    index = Index(np.arange(10))
    values = Series(np.ones(10), index, dtype=dtype)
    labels = Series([np.nan, 'foo', 'bar', 'bar', np.nan, np.nan,
                     'bar', 'bar', np.nan, 'foo'], index=index)
    grouped = values.groupby(labels)
    expected = Series([4, 2], index=['bar', 'foo'])

    # this SHOULD be an int, but the result dtype is deliberately not
    # asserted here (values only)
    counts = grouped.agg(len)
    assert_series_equal(counts, expected, check_dtype=False)

    # explicitly return a float from my function
    def float_len(x):
        return float(len(x))

    float_counts = grouped.agg(float_len)
    assert_series_equal(float_counts, expected, check_dtype=False)
    assert issubclass(float_counts.dtype.type, np.dtype(dtype).type)


def test_indices_concatenation_order():
    """Check how ``apply`` concatenates per-group frames with empty pieces.

    Each helper keeps the rows with odd ``b`` and squares them; they differ
    only in what they return when no row is left and in how the non-empty
    piece is indexed.
    """

    # GH 2808

    def f1(x):
        # empty and non-empty pieces both carry a two-level (b, c) index
        y = x[(x.b % 2) == 1] ** 2
        if y.empty:
            multiindex = MultiIndex(levels=[[]] * 2, codes=[[]] * 2,
                                    names=['b', 'c'])
            res = DataFrame(None, columns=['a'], index=multiindex)
            return res
        else:
            y = y.set_index(['b', 'c'])
            return y

    def f2(x):
        # empty piece is a plain ``DataFrame()`` while the non-empty piece
        # has a two-level index
        y = x[(x.b % 2) == 1] ** 2
        if y.empty:
            return DataFrame()
        else:
            y = y.set_index(['b', 'c'])
            return y

    def f3(x):
        # empty piece has a two-level index, but the non-empty piece is
        # returned without ``set_index``
        y = x[(x.b % 2) == 1] ** 2
        if y.empty:
            multiindex = MultiIndex(levels=[[]] * 2, codes=[[]] * 2,
                                    names=['foo', 'bar'])
            res = DataFrame(None, columns=['a', 'b'], index=multiindex)
            return res
        else:
            return y

    # the two frames differ only in the first value of 'a' (1 vs 3), i.e. in
    # which group key the first row belongs to
    df = DataFrame({'a': [1, 2, 2, 2], 'b': lrange(4), 'c': lrange(5, 9)})

    df2 = DataFrame({'a': [3, 2, 2, 2], 'b': lrange(4), 'c': lrange(5, 9)})

    # correct result
    result1 = df.groupby('a').apply(f1)
    result2 = df2.groupby('a').apply(f1)
    assert_frame_equal(result1, result2)

    # should fail (not the same number of levels)
    msg = "Cannot concat indices that do not have the same number of levels"
    with pytest.raises(AssertionError, match=msg):
        df.groupby('a').apply(f2)
    with pytest.raises(AssertionError, match=msg):
        df2.groupby('a').apply(f2)

    # should fail (incorrect shape)
    # the same error message as above is expected here
    with pytest.raises(AssertionError, match=msg):
        df.groupby('a').apply(f3)
    with pytest.raises(AssertionError, match=msg):
        df2.groupby('a').apply(f3)


def test_attr_wrapper(ts):
    """Check reductions and attribute access forwarded by SeriesGroupBy."""
    grouped = ts.groupby(lambda x: x.weekday())

    # std() must match an explicit standard deviation with ddof=1
    result = grouped.std()
    expected = grouped.agg(lambda x: np.std(x, ddof=1))
    assert_series_equal(result, expected)

    # this is pretty cool
    # describe() on the groupby equals describing each group separately and
    # stacking the per-group results as rows
    result = grouped.describe()
    expected = {name: gp.describe() for name, gp in grouped}
    expected = DataFrame(expected).T
    assert_frame_equal(result, expected)

    # get attribute
    # NOTE(review): result and expected are computed but never compared, so
    # this part only verifies that both calls run without raising -- confirm
    # whether an assert_series_equal(result, expected) was intended.
    result = grouped.dtype
    expected = grouped.agg(lambda x: x.dtype)

    # make sure raises error
    msg = "'SeriesGroupBy' object has no attribute 'foo'"
    with pytest.raises(AttributeError, match=msg):
        getattr(grouped, 'foo')


def test_frame_groupby(tsframe):
    """Aggregate, transform and iterate a frame grouped by weekday."""
    by_weekday = tsframe.groupby(lambda stamp: stamp.weekday())

    # aggregate
    means = by_weekday.aggregate(np.mean)
    assert len(means) == 5
    assert len(means.columns) == 4

    # by string
    with_key = tsframe.copy()
    with_key['weekday'] = [stamp.weekday() for stamp in with_key.index]
    by_column = with_key.groupby('weekday').aggregate(np.mean)
    assert_frame_equal(by_column, means, check_names=False)

    # transform
    head_groups = tsframe.head(30).groupby(lambda stamp: stamp.weekday())
    demeaned = head_groups.transform(lambda x: x - x.mean())
    assert len(demeaned) == 30
    assert len(demeaned.columns) == 4

    # transform propagate: every row receives the mean of its own group
    broadcast = head_groups.transform(lambda x: x.mean())
    for _, group in head_groups:
        group_mean = group.mean()
        for label in group.index:
            tm.assert_series_equal(broadcast.xs(label), group_mean,
                                   check_names=False)

    # iterate
    for weekday, group in head_groups:
        assert group.index[0].weekday() == weekday

    # groups / group_indices
    label_groups = head_groups.groups
    positions = head_groups.indices

    for key, labels in compat.iteritems(label_groups):
        same_labels = tsframe.index.take(positions[key])
        assert (same_labels == labels).all()


def test_frame_groupby_columns(tsframe):
    """Grouping columns via a dict with ``axis=1`` mirrors the transpose."""
    mapping = {'A': 0, 'B': 0, 'C': 1, 'D': 1}
    by_columns = tsframe.groupby(mapping, axis=1)

    # aggregate
    means = by_columns.aggregate(np.mean)
    assert len(means) == len(tsframe)
    assert len(means.columns) == 2

    # transform
    def demean(x):
        return x - x.mean()

    by_rows_of_transpose = tsframe.T.groupby(mapping, axis=0)
    assert_frame_equal(by_rows_of_transpose.transform(demean).T,
                       by_columns.transform(demean))

    # iterate
    for _, sub_frame in by_columns:
        assert len(sub_frame.columns) == 2


def test_frame_set_name_single(df):
    """A single string key names the result index unless as_index=False."""
    grouped = df.groupby('A')

    # whole-frame reductions
    assert grouped.mean().index.name == 'A'
    assert df.groupby('A', as_index=False).mean().index.name != 'A'
    assert grouped.agg(np.mean).index.name == 'A'
    assert grouped.agg({'C': np.mean, 'D': np.std}).index.name == 'A'

    # single selected column
    assert grouped['C'].mean().index.name == 'A'
    assert grouped['C'].agg(np.mean).index.name == 'A'
    assert grouped['C'].agg([np.mean, np.std]).index.name == 'A'

    with tm.assert_produces_warning(FutureWarning,
                                    check_stacklevel=False):
        renamed = grouped['C'].agg({'foo': np.mean, 'bar': np.std})
    assert renamed.index.name == 'A'


def test_multi_func(df):
    """Grouping by two callables matches grouping by the two columns."""
    key_a = df['A']
    key_b = df['B']

    by_funcs = df.groupby([key_a.get, key_b.get]).mean()
    by_names = df.groupby(['A', 'B']).mean()

    # TODO groupby get drops names
    assert_frame_equal(by_funcs.loc[:, ['C', 'D']],
                       by_names.loc[:, ['C', 'D']],
                       check_names=False)

    # some "groups" with no data
    sparse = DataFrame({'v1': np.random.randn(6),
                        'v2': np.random.randn(6),
                        'k1': np.array(['b', 'b', 'b', 'a', 'a', 'a']),
                        'k2': np.array(['1', '1', '1', '2', '2', '2'])},
                       index=['one', 'two', 'three', 'four', 'five', 'six'])
    # only verify that it works for now
    sparse.groupby(['k1', 'k2']).agg(np.sum)


def test_multi_key_multiple_functions(df):
    """A list of functions yields one result column per function."""
    c_by_ab = df.groupby(['A', 'B'])['C']

    result = c_by_ab.agg([np.mean, np.std])

    # build the same frame from the two single-function aggregations
    mean_col = c_by_ab.agg(np.mean)
    std_col = c_by_ab.agg(np.std)
    expected = DataFrame({'mean': mean_col, 'std': std_col})
    assert_frame_equal(result, expected)


def test_frame_multi_key_function_list():
    """A function list on a frame equals per-column results side by side."""
    data = DataFrame(
        {'A': ['foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar',
               'foo', 'foo', 'foo'],
         'B': ['one', 'one', 'one', 'two', 'one', 'one', 'one', 'two',
               'two', 'two', 'one'],
         'C': ['dull', 'dull', 'shiny', 'dull', 'dull', 'shiny', 'shiny',
               'dull', 'shiny', 'shiny', 'shiny'],
         'D': np.random.randn(11),
         'E': np.random.randn(11),
         'F': np.random.randn(11)})

    by_ab = data.groupby(['A', 'B'])
    funcs = [np.mean, np.std]
    result = by_ab.agg(funcs)

    # aggregate each numeric column on its own and glue them together
    value_cols = ['D', 'E', 'F']
    per_column = [by_ab[col].agg(funcs) for col in value_cols]
    expected = pd.concat(per_column, keys=value_cols, axis=1)

    assert isinstance(result.index, MultiIndex)
    assert isinstance(expected.index, MultiIndex)
    assert_frame_equal(result, expected)


@pytest.mark.parametrize('op', [lambda x: x.sum(), lambda x: x.mean()])
@pytest.mark.filterwarnings("ignore:\\nPanel:FutureWarning")
def test_groupby_multiple_columns(df, op):
    """Two-key reductions agree with reducing each nested group by hand."""
    grouped = df.groupby(['A', 'B'])
    result1 = op(grouped)

    # reduce every (A, B) cell manually through nested single-key groupbys
    nested = defaultdict(dict)
    for a_key, a_group in df.groupby('A'):
        for b_key, b_group in a_group.groupby('B'):
            nested[a_key][b_key] = op(b_group.loc[:, ['C', 'D']])
    frames = {key: DataFrame(cells)
              for key, cells in compat.iteritems(nested)}
    expected = Panel.fromDict(frames).swapaxes(0, 1)
    expected.major_axis.name, expected.minor_axis.name = 'A', 'B'

    # a little bit crude
    for col in ['C', 'D']:
        result_col = op(grouped[col])
        exp = expected[col]
        pivoted = result1[col].unstack()
        pivoted2 = result_col.unstack()
        assert_frame_equal(pivoted.reindex_like(exp), exp)
        assert_frame_equal(pivoted2.reindex_like(exp), exp)

    # test single series works the same
    result = df['C'].groupby([df['A'], df['B']]).mean()
    expected = df.groupby(['A', 'B']).mean()['C']

    assert_series_equal(result, expected)


def test_groupby_as_index_agg(df):
    """Check ``agg`` on groupbys created with ``as_index=False``.

    Compares function, ordered-dict and renaming-dict aggregation against
    the matching named reductions for one and two keys, then compares
    ``as_index=False`` results with index-reset ``as_index=True`` results
    when the key is a Series that shares a name with a column.
    """
    grouped = df.groupby('A', as_index=False)

    # single-key

    result = grouped.agg(np.mean)
    expected = grouped.mean()
    assert_frame_equal(result, expected)

    # per-column functions: mean for 'C', sum for 'D'
    result2 = grouped.agg(OrderedDict([['C', np.mean], ['D', np.sum]]))
    expected2 = grouped.mean()
    expected2['D'] = grouped.sum()['D']
    assert_frame_equal(result2, expected2)

    # renaming dict on a selected column; this block uses as_index=True and
    # expects a FutureWarning from the dict-renaming call
    grouped = df.groupby('A', as_index=True)
    expected3 = grouped['C'].sum()
    expected3 = DataFrame(expected3).rename(columns={'C': 'Q'})

    with tm.assert_produces_warning(FutureWarning,
                                    check_stacklevel=False):
        result3 = grouped['C'].agg({'Q': np.sum})
    assert_frame_equal(result3, expected3)

    # multi-key

    grouped = df.groupby(['A', 'B'], as_index=False)

    result = grouped.agg(np.mean)
    expected = grouped.mean()
    assert_frame_equal(result, expected)

    result2 = grouped.agg(OrderedDict([['C', np.mean], ['D', np.sum]]))
    expected2 = grouped.mean()
    expected2['D'] = grouped.sum()['D']
    assert_frame_equal(result2, expected2)

    # NOTE(review): unlike the single-key case above, this renaming-dict
    # call is not wrapped in assert_produces_warning -- confirm whether a
    # warning is expected on this code path.
    expected3 = grouped['C'].sum()
    expected3 = DataFrame(expected3).rename(columns={'C': 'Q'})
    result3 = grouped['C'].agg({'Q': np.sum})
    assert_frame_equal(result3, expected3)

    # GH7115 & GH8112 & GH8582
    # the key Series is named 'jim', the same as one of the frame's columns
    df = DataFrame(np.random.randint(0, 100, (50, 3)),
                   columns=['jim', 'joe', 'jolie'])
    ts = Series(np.random.randint(5, 10, 50), name='jim')

    gr = df.groupby(ts)
    gr.nth(0)  # invokes set_selection_from_grouper internally
    # calling nth() first must not change what a later apply() returns
    assert_frame_equal(gr.apply(sum), df.groupby(ts).apply(sum))

    for attr in ['mean', 'max', 'count', 'idxmax', 'cumsum', 'all']:
        # left: Series key with as_index=False
        gr = df.groupby(ts, as_index=False)
        left = getattr(gr, attr)()

        # right: same key as a bare ndarray, index dropped afterwards
        gr = df.groupby(ts.values, as_index=True)
        right = getattr(gr, attr)().reset_index(drop=True)

        assert_frame_equal(left, right)


def test_as_index_series_return_frame(df):
    """Selecting one column with as_index=False still yields a DataFrame."""
    # (groupby object, columns expected in the single-column result)
    cases = [
        (df.groupby('A', as_index=False), ['A', 'C']),
        (df.groupby(['A', 'B'], as_index=False), ['A', 'B', 'C']),
    ]
    # the same check is made through ``agg`` and through the named method
    reducers = [lambda gb: gb.agg(np.sum), lambda gb: gb.sum()]

    for reducer in reducers:
        for grouped, columns in cases:
            result = reducer(grouped['C'])
            expected = reducer(grouped).loc[:, columns]
            assert isinstance(result, DataFrame)
            assert_frame_equal(result, expected)


def test_as_index_series_column_slice_raises(df):
    """Selecting a second column after one was chosen raises IndexError."""
    # GH15072
    by_a = df.groupby('A', as_index=False)

    with pytest.raises(IndexError, match=r"Column\(s\) C already selected"):
        by_a['C'].__getitem__('D')


def test_groupby_as_index_cython(df):
    """as_index=False puts the group keys in leading columns."""
    # single-key
    result = df.groupby('A', as_index=False).mean()
    expected = df.groupby(['A']).mean()
    expected.insert(0, 'A', expected.index)
    expected.index = np.arange(len(expected))
    assert_frame_equal(result, expected)

    # multi-key
    result = df.groupby(['A', 'B'], as_index=False).mean()
    expected = df.groupby(['A', 'B']).mean()

    # split the (A, B) index tuples back into one sequence per key
    a_keys, b_keys = lzip(*expected.index.values)
    expected.insert(0, 'A', a_keys)
    expected.insert(1, 'B', b_keys)
    expected.index = np.arange(len(expected))
    assert_frame_equal(result, expected)


def test_groupby_as_index_series_scalar(df):
    """A scalar-returning agg on one column keeps the key columns."""
    # GH #421
    by_ab = df.groupby(['A', 'B'], as_index=False)

    lengths_of_c = by_ab['C'].agg(len)
    lengths_of_all = by_ab.agg(len)
    assert_frame_equal(lengths_of_c, lengths_of_all.loc[:, ['A', 'B', 'C']])


def test_groupby_as_index_corner(df, ts):
    """as_index=False is rejected for a Series and for axis=1 grouping."""
    with pytest.raises(TypeError,
                       match="as_index=False only valid with DataFrame"):
        ts.groupby(lambda x: x.weekday(), as_index=False)

    with pytest.raises(ValueError,
                       match="as_index=False only valid for axis=0"):
        df.groupby(lambda x: x.lower(), as_index=False, axis=1)


def test_groupby_multiple_key(df):
    """Group by (year, month, day) callables along rows and along columns.

    The ``df`` fixture is requested by the signature but not used; the test
    builds its own frame, whose index labels expose ``year``, ``month`` and
    ``day``.
    """
    frame = tm.makeTimeDataFrame()

    # the sums are compared with the raw values, i.e. the check relies on
    # every row forming its own (year, month, day) group
    grouped = frame.groupby([lambda x: x.year, lambda x: x.month,
                             lambda x: x.day])
    agged = grouped.sum()
    assert_almost_equal(frame.values, agged.values)

    # same keys applied to the columns of the transposed frame
    grouped = frame.T.groupby([lambda x: x.year,
                               lambda x: x.month,
                               lambda x: x.day], axis=1)

    agged = grouped.agg(lambda x: x.sum())
    tm.assert_index_equal(agged.index, frame.columns)
    assert_almost_equal(frame.T.values, agged.values)


def test_groupby_multi_corner(df):
    """An all-NaN column aggregates to NaN without disturbing the others."""
    # test that having an all-NA column doesn't mess you up
    with_bad = df.copy()
    with_bad['bad'] = np.nan
    agged = with_bad.groupby(['A', 'B']).mean()

    # build the expectation from the untouched frame so that the result is
    # not simply compared with a second run of the same computation
    expected = df.groupby(['A', 'B']).mean()
    expected['bad'] = np.nan

    assert_frame_equal(agged, expected)


def test_omit_nuisance(df):
    """Reductions match those over columns A, C, D only; axis=1 raises."""
    by_a = df.groupby('A')

    result = by_a.mean()
    expected = df.loc[:, ['A', 'C', 'D']].groupby('A').mean()
    assert_frame_equal(result, expected)

    assert_frame_equal(by_a.agg(np.mean), by_a.mean())

    # add a datetime column and compare agg(np.sum) with sum()
    with_dates = df.loc[:, ['A', 'C', 'D']]
    with_dates['E'] = datetime.now()
    by_a = with_dates.groupby('A')
    assert_frame_equal(by_a.agg(np.sum), by_a.sum())

    # won't work with axis = 1
    by_columns = with_dates.groupby({'A': 0, 'C': 0, 'D': 1, 'E': 1},
                                    axis=1)
    msg = (r'\("unsupported operand type\(s\) for \+: '
           "'Timestamp' and 'float'\""
           r", u?'occurred at index 0'\)")
    with pytest.raises(TypeError, match=msg):
        by_columns.agg(lambda x: x.sum(0, numeric_only=False))


def test_omit_nuisance_python_multiple(three_group):
    """``agg(np.mean)`` and ``mean()`` agree for a two-key groupby."""
    by_ab = three_group.groupby(['A', 'B'])

    via_agg = by_ab.agg(np.mean)
    via_method = by_ab.mean()
    assert_frame_equal(via_agg, via_method)


def test_empty_groups_corner(mframe):
    """Key combinations without any rows do not break aggregation."""
    # handle empty groups
    frame = DataFrame({'k1': np.array(['b', 'b', 'b', 'a', 'a', 'a']),
                       'k2': np.array(['1', '1', '1', '2', '2', '2']),
                       'k3': ['foo', 'bar'] * 3,
                       'v1': np.random.randn(6),
                       'v2': np.random.randn(6)})

    by_keys = frame.groupby(['k1', 'k2'])
    assert_frame_equal(by_keys.agg(np.mean), by_keys.mean())

    # a two-row slice of the MultiIndex frame, grouped by its first level
    by_first = mframe[3:5].groupby(level=0)
    frame_means = by_first.apply(lambda x: x.mean())
    a_means = by_first['A'].apply(np.mean)
    assert_series_equal(frame_means['A'], a_means)
    assert frame_means.index.name == 'first'


def test_nonsense_func():
    """A key function that fails on the index labels raises TypeError."""
    frame = DataFrame([0])
    expected_msg = (
        r"unsupported operand type\(s\) for \+: '(int|long)' and 'str'")
    with pytest.raises(TypeError, match=expected_msg):
        frame.groupby(lambda label: label + 'foo')


def test_wrap_aggregated_output_multindex(mframe):
    """Aggregated output keeps MultiIndex columns; a failing column drops."""
    transposed = mframe.T
    transposed['baz', 'two'] = 'peekaboo'

    keys = [np.array([0, 0, 1]), np.array([0, 0, 1])]
    means = transposed.groupby(keys).agg(np.mean)
    assert isinstance(means.columns, MultiIndex)

    def sum_except_foo_one(ser):
        # reject exactly one column; the assertion below expects the result
        # to have one column fewer than the input
        if ser.name == ('foo', 'one'):
            raise TypeError
        return ser.sum()

    sums = transposed.groupby(keys).aggregate(sum_except_foo_one)
    assert len(sums.columns) + 1 == len(transposed.columns)


def test_groupby_level_apply(mframe):
    """Grouping by a level names the result index after that level."""
    for level, name in [(0, 'first'), (1, 'second')]:
        counts = mframe.groupby(level=level).count()
        assert counts.index.name == name

    # same for a single column taken from the frame
    series_counts = mframe['A'].groupby(level=0).count()
    assert series_counts.index.name == 'first'


def test_groupby_level_mapper(mframe):
    """A dict mapper combined with ``level`` maps that level's labels."""
    flat = mframe.reset_index()

    # (level number, level name, label -> group mapping)
    cases = [
        (0, 'first', {'foo': 0, 'bar': 0, 'baz': 1, 'qux': 1}),
        (1, 'second', {'one': 0, 'two': 0, 'three': 1}),
    ]
    for level, name, mapper in cases:
        result = mframe.groupby(mapper, level=level).sum()

        # apply the mapping by hand and group by the resulting array
        mapped = np.array([mapper.get(label) for label in flat[name]])
        expected = mframe.groupby(mapped).sum()
        expected.index.name = name

        assert_frame_equal(result, expected)


def test_groupby_level_nonmulti():
    """``level`` handling when the index is not a MultiIndex."""
    # GH 1313, GH 13901
    s = Series([1, 2, 3, 10, 4, 5, 20, 6],
               Index([1, 2, 3, 1, 4, 5, 2, 6], name='foo'))
    expected = Series([11, 22, 3, 4, 5, 6],
                      Index(range(1, 7), name='foo'))

    # all of these spellings address the only level
    for level in [0, [0], -1, [-1]]:
        result = s.groupby(level=level).sum()
        tm.assert_series_equal(result, expected)

    # invalid level specifications and the message each must produce
    bad_levels = [
        (1, "level > 0 or level < -1 only valid with MultiIndex"),
        (-2, "level > 0 or level < -1 only valid with MultiIndex"),
        ([], "No group keys passed!"),
        ([0, 0], "multiple levels only valid with MultiIndex"),
        ([0, 1], "multiple levels only valid with MultiIndex"),
        ([1], "level > 0 or level < -1 only valid with MultiIndex"),
    ]
    for level, msg in bad_levels:
        with pytest.raises(ValueError, match=msg):
            s.groupby(level=level)


def test_groupby_complex():
    """Complex values can be summed per index level."""
    # GH 12902
    values = np.arange(4) * (1 + 2j)
    ser = Series(data=values, index=[0, 0, 1, 1])
    expected = Series((1 + 2j, 5 + 10j))

    assert_series_equal(ser.groupby(level=0).sum(), expected)
    assert_series_equal(ser.sum(level=0), expected)


def test_mutate_groups():
    """Mutating the group inside ``apply`` matches working on a copy."""
    # GH3380
    cat2 = (['c'] * 2 + ['d'] * 2 + ['e'] * 2 + ['f'] * 2 +
            ['c'] * 2 + ['d'] * 2 + ['e'] * 2)
    frame = DataFrame({
        'cat1': ['a'] * 8 + ['b'] * 6,
        'cat2': cat2,
        'cat3': ['g%s' % x for x in lrange(1, 15)],
        'val': np.random.randint(100, size=14),
    })

    def rank_in_place(x):
        # adds a column to the group that was passed in
        x['rank'] = x.val.rank(method='min')
        return x.groupby('cat2')['rank'].min()

    def rank_on_copy(x):
        # same computation, leaving the passed-in group untouched
        return rank_in_place(x.copy())

    via_copy = frame.groupby('cat1').apply(rank_on_copy)
    via_mutation = frame.groupby('cat1').apply(rank_in_place)
    assert_series_equal(via_copy, via_mutation)


def test_no_mutate_but_looks_like():
    """Returning a column of a slice or of the group gives equal results."""
    # GH 8467
    # the first function takes the column from a full slice of the group,
    # the second from the group itself; both should yield the same results
    frame = DataFrame({'key': [1, 1, 1, 2, 2, 2, 3, 3, 3],
                       'value': range(9)})

    sliced = frame.groupby('key', group_keys=True).apply(lambda x: x[:].key)
    direct = frame.groupby('key', group_keys=True).apply(lambda x: x.key)
    assert_series_equal(sliced, direct)


def test_groupby_series_indexed_differently():
    """A key Series with a different index is matched by label."""
    values = Series([5.0, -9.0, 4.0, 100., -5., 55., 6.7],
                    index=Index(['a', 'b', 'c', 'd', 'e', 'f', 'g']))
    keys = Series([1.0, 1.0, 4.0, 5.0, 5.0, 7.0],
                  index=Index(['a', 'b', 'd', 'f', 'g', 'h']))

    result = values.groupby(keys).mean()

    # look the key up per label through the reindexed key Series instead
    lookup = keys.reindex(values.index).get
    expected = values.groupby(lookup).mean()
    assert_series_equal(result, expected)


def test_groupby_with_hier_columns():
    """Group a frame with MultiIndex columns along either axis.

    Checks that row-wise grouping leaves the hierarchical columns intact,
    that column-wise grouping leaves the row index intact, and that a
    non-numeric column added last is absent from the ``mean`` result.
    """
    first = ['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux']
    second = ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']
    index = MultiIndex.from_tuples(list(zip(first, second)))
    columns = MultiIndex.from_tuples([('A', 'cat'), ('B', 'dog'), (
        'B', 'cat'), ('A', 'dog')])
    df = DataFrame(np.random.randn(8, 4), index=index, columns=columns)

    result = df.groupby(level=0).mean()
    tm.assert_index_equal(result.columns, columns)

    result = df.groupby(level=0, axis=1).mean()
    tm.assert_index_equal(result.index, df.index)

    result = df.groupby(level=0).agg(np.mean)
    tm.assert_index_equal(result.columns, columns)

    result = df.groupby(level=0).apply(lambda x: x.mean())
    tm.assert_index_equal(result.columns, columns)

    result = df.groupby(level=0, axis=1).agg(lambda x: x.mean(1))
    tm.assert_index_equal(result.columns, Index(['A', 'B']))
    tm.assert_index_equal(result.index, df.index)

    # add a nuisance column
    df['A', 'foo'] = 'bar'
    result = df.groupby(level=0).mean()
    # the string column was appended last, so all columns but the last
    # are expected in the result
    tm.assert_index_equal(result.columns, df.columns[:-1])


def test_grouping_ndarray(df):
    """Grouping by a raw ndarray matches grouping by the column name."""
    by_values = df.groupby(df['A'].values).sum()
    by_name = df.groupby('A').sum()

    # Note: no names when grouping by value
    assert_frame_equal(by_values, by_name, check_names=False)


def test_groupby_wrong_multi_labels():
    """``agg(np.mean)`` and ``mean()`` agree for a four-key groupby."""
    csv_text = """index,foo,bar,baz,spam,data
0,foo1,bar1,baz1,spam2,20
1,foo1,bar2,baz1,spam3,30
2,foo2,bar2,baz1,spam2,40
3,foo1,bar1,baz2,spam1,50
4,foo3,bar1,baz2,spam1,60"""

    frame = read_csv(StringIO(csv_text), index_col=0)
    by_labels = frame.groupby(['foo', 'bar', 'baz', 'spam'])

    via_agg = by_labels.agg(np.mean)
    via_method = by_labels.mean()
    assert_frame_equal(via_agg, via_method)


def test_groupby_series_with_name(df):
    """Named Series keys lend their names to the result index/columns."""
    by_a = df.groupby(df['A']).mean()
    by_a_flat = df.groupby(df['A'], as_index=False).mean()
    assert by_a.index.name == 'A'
    assert 'A' in by_a_flat

    by_ab = df.groupby([df['A'], df['B']]).mean()
    by_ab_flat = df.groupby([df['A'], df['B']], as_index=False).mean()
    assert by_ab.index.names == ('A', 'B')
    for key in ['A', 'B']:
        assert key in by_ab_flat


def test_seriesgroupby_name_attr(df):
    """Reductions on a selected column keep that column's name."""
    # GH 6265
    c_by_a = df.groupby('A')['C']
    assert c_by_a.count().name == 'C'
    assert c_by_a.mean().name == 'C'

    def double_sum(x):
        return np.sum(x) * 2

    assert c_by_a.agg(double_sum).name == 'C'


def test_consistency_name():
    """Column-of-groupby and groupby-of-column counts are identical."""
    # GH 12363
    frame = DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
                             'foo', 'bar', 'foo', 'foo'],
                       'B': ['one', 'one', 'two', 'two',
                             'two', 'two', 'one', 'two'],
                       'C': np.random.randn(8) + 1.0,
                       'D': np.arange(8)})

    via_frame = frame.groupby(['A']).B.count()
    via_series = frame.B.groupby(frame.A).count()
    assert_series_equal(via_series, via_frame)


def test_groupby_name_propagation(df):
    # GH 6124
    def summarize(df, name=None):
        return Series({'count': 1, 'mean': 2, 'omissions': 3, }, name=name)

    def summarize_random_name(df):
        # Provide a different name for each Series.  In this case, groupby
        # should not attempt to propagate the Series name since they are
        # inconsistent.
        return Series({
            'count': 1,
            'mean': 2,
            'omissions': 3,
        }, name=df.iloc[0]['A'])

    metrics = df.groupby('A').apply(summarize)
    assert metrics.columns.name is None
    metrics = df.groupby('A').apply(summarize, 'metrics')
    assert metrics.columns.name == 'metrics'
    metrics = df.groupby('A').apply(summarize_random_name)
    assert metrics.columns.name is None


def test_groupby_nonstring_columns():
    """Grouping by integer label 0 equals grouping by the column itself."""
    frame = DataFrame([np.arange(10) for _ in range(10)])

    by_label = frame.groupby(0).mean()
    by_column = frame.groupby(frame[0]).mean()
    assert_frame_equal(by_label, by_column)


def test_groupby_mixed_type_columns():
    """``first`` and ``sum`` work when column labels mix str and int."""
    # GH 13432, unorderable types in py3
    frame = DataFrame([[0, 1, 2]], columns=['A', 'B', 0])
    expected = DataFrame([[1, 2]],
                         columns=['B', 0],
                         index=Index([0], name='A'))

    for reducer in ('first', 'sum'):
        result = getattr(frame.groupby('A'), reducer)()
        tm.assert_frame_equal(result, expected)


# TODO: Ensure warning isn't emitted in the first place
@pytest.mark.filterwarnings("ignore:Mean of:RuntimeWarning")
def test_cython_grouper_series_bug_noncontig():
    """Grouped median of an all-NaN column sliced from a 2-D array is NaN."""
    grid = np.full((100, 100), np.nan)
    # a column slice of a 2-D array is a strided (non-contiguous) view
    column = Series(grid[:, 0], index=lrange(100))
    keys = np.tile(lrange(10), 10)

    medians = column.groupby(keys).agg(Series.median)
    assert medians.isna().all()


def test_series_grouper_noncontig_index():
    """Smoke test: agg may touch the elements of a strided index slice."""
    full_index = Index(tm.rands_array(10, 100))

    ser = Series(np.random.randn(50), index=full_index[::2])
    keys = np.random.randint(0, 5, 50)

    def count_distinct_labels(group):
        # accessing the index elements causes segfault
        return len(set(map(id, group.index)))

    # it works! (only completion matters, nothing is asserted)
    ser.groupby(keys).agg(count_distinct_labels)


def test_convert_objects_leave_decimal_alone():
    """Decimal values returned from agg are kept as Decimal objects."""
    ser = Series(lrange(5))
    keys = np.array(['a', 'b', 'c', 'd', 'e'], dtype='O')

    def convert_fast(x):
        return Decimal(str(x.mean()))

    def convert_force_pure(x):
        # base will be length 0
        # NOTE(review): presumably this assert exists to steer agg onto a
        # different code path than convert_fast -- confirm
        assert (len(x.values.base) > 0)
        return Decimal(str(x.mean()))

    grouped = ser.groupby(keys)

    for converter in (convert_fast, convert_force_pure):
        result = grouped.agg(converter)
        assert result.dtype == np.object_
        assert isinstance(result[0], Decimal)


def test_groupby_dtype_inference_empty():
    """``first`` on an empty frame keeps the key and value dtypes."""
    # GH 6733
    frame = DataFrame({'x': [], 'range': np.arange(0, dtype='int64')})
    assert frame['x'].dtype == np.float64

    result = frame.groupby('x').first()

    empty_key = Index([], name='x', dtype=np.float64)
    range_column = Series([], index=empty_key, dtype='int64')
    expected = DataFrame({'range': range_column})
    assert_frame_equal(result, expected, by_blocks=True)


def test_groupby_list_infer_array_like(df):
    """A list passed as ``by`` is read as values or as column labels.

    * A list as long as the frame is used as the grouping values and gives
      the same means as grouping by the ``'A'`` Series.
    * A shorter list of strings is looked up as column labels and raises
      ``KeyError`` on the first missing label.
    * A list of existing column labels whose length happens to equal the
      frame length is still treated as labels.
    """
    result = df.groupby(list(df['A'])).mean()
    expected = df.groupby(df['A']).mean()
    assert_frame_equal(result, expected, check_names=False)

    with pytest.raises(KeyError, match=r"^'foo'$"):
        df.groupby(list(df['A'][:-1]))

    # pathological case of ambiguity: two labels, two rows
    df = DataFrame({'foo': [0, 1],
                    'bar': [3, 4],
                    'val': np.random.randn(2)})

    result = df.groupby(['foo', 'bar']).mean()
    expected = df.groupby([df['foo'], df['bar']]).mean()[['val']]
    # previously both frames were built but never compared
    assert_frame_equal(result, expected)


def test_groupby_keys_same_size_as_index():
    """Grouping by [Grouper(level, freq), column] with as many keys as rows."""
    # GH 11185
    freq = 's'
    start = pd.Timestamp('2015-09-29T11:34:44-0700')
    stamps = pd.date_range(start=start, periods=2, freq=freq)
    frame = pd.DataFrame([['A', 10], ['B', 15]],
                         columns=['metric', 'values'],
                         index=stamps)

    keys = [pd.Grouper(level=0, freq=freq), 'metric']
    result = frame.groupby(keys).mean()
    expected = frame.set_index([frame.index, 'metric'])

    assert_frame_equal(result, expected)


def test_groupby_one_row():
    """Grouping by a missing column raises KeyError for 1 and 2 rows."""
    # GH 11741
    msg = r"^'Z'$"
    for n_rows in (1, 2):
        frame = pd.DataFrame(np.random.randn(n_rows, 4),
                             columns=list('ABCD'))
        with pytest.raises(KeyError, match=msg):
            frame.groupby('Z')


def test_groupby_nat_exclude():
    """Rows whose key is NaN/NaT do not appear in any group."""
    # GH 6992
    jan = pd.Timestamp('2013-01-01')
    feb = pd.Timestamp('2013-02-01')
    df = pd.DataFrame(
        {'values': np.random.randn(8),
         'dt': [np.nan, jan, np.nan, feb, np.nan, feb, np.nan, jan],
         'str': [np.nan, 'a', np.nan, 'a', np.nan, 'a', np.nan, 'b']})
    grouped = df.groupby('dt')

    group_keys = sorted(grouped.groups.keys())
    assert len(group_keys) == 2
    expected_labels = [pd.Index([1, 7]), pd.Index([3, 5])]
    for key, labels in zip(group_keys, expected_labels):
        # grouped.groups keys are np.datetime64 with system tz
        # not to be affected by tz, only compare values
        tm.assert_index_equal(grouped.groups[key], labels)

    # confirm obj is not filtered
    tm.assert_frame_equal(grouped.grouper.groupings[0].obj, df)
    assert grouped.ngroups == 2

    positions = {
        Timestamp('2013-01-01 00:00:00'): np.array([1, 7], dtype=np.int64),
        Timestamp('2013-02-01 00:00:00'): np.array([3, 5], dtype=np.int64)
    }
    for key in grouped.indices:
        tm.assert_numpy_array_equal(grouped.indices[key], positions[key])

    for stamp, rows in ((Timestamp('2013-01-01'), [1, 7]),
                        (Timestamp('2013-02-01'), [3, 5])):
        tm.assert_frame_equal(grouped.get_group(stamp), df.iloc[rows])

    with pytest.raises(KeyError, match=r"^NaT$"):
        grouped.get_group(pd.NaT)

    # frames whose key column holds nothing but missing values
    nan_df = DataFrame({'nan': [np.nan, np.nan, np.nan],
                        'nat': [pd.NaT, pd.NaT, pd.NaT]})
    assert nan_df['nan'].dtype == 'float64'
    assert nan_df['nat'].dtype == 'datetime64[ns]'

    for column in ['nan', 'nat']:
        empty = nan_df.groupby(column)
        assert empty.groups == {}
        assert empty.ngroups == 0
        assert empty.indices == {}
        with pytest.raises(KeyError, match=r"^nan$"):
            empty.get_group(np.nan)
        with pytest.raises(KeyError, match=r"^NaT$"):
            empty.get_group(pd.NaT)


@pytest.mark.filterwarnings("ignore:\\nPanel:FutureWarning")
def test_sparse_friendly(df):
    """Smoke test: groupby reductions and iteration run on sparse data."""
    sparse_frame = df[['C', 'D']].to_sparse()
    panel = tm.makePanel()
    tm.add_nans(panel)

    def _exercise(gp):
        # nothing is asserted; each call merely has to complete
        gp.mean()
        gp.agg(np.mean)
        dict(iter(gp))

    def halve(label):
        return label // 2

    # it works!
    _exercise(sparse_frame.groupby(halve))
    _exercise(sparse_frame['C'].groupby(halve))
    _exercise(sparse_frame.groupby(df['A']))

    # do this someday
    # _exercise(panel.groupby(lambda x: x.month, axis=1))


@pytest.mark.filterwarnings("ignore:\\nPanel:FutureWarning")
def test_panel_groupby():
    """Panel.groupby along the items, major and minor axes."""
    panel = tm.makePanel()
    tm.add_nans(panel)

    # items axis: mapping of item -> group
    item_groups = {'ItemA': 0, 'ItemB': 0, 'ItemC': 1}
    by_items = panel.groupby(item_groups, axis='items')
    means = by_items.mean()
    means_via_agg = by_items.agg(lambda x: x.mean('items'))

    tm.assert_panel_equal(means, means_via_agg)
    tm.assert_index_equal(means.items, Index([0, 1]))

    # major axis: group by the month of each label
    by_month = panel.groupby(lambda x: x.month, axis='major')
    means = by_month.mean()

    months = Index(sorted(set(panel.major_axis.month)))
    tm.assert_index_equal(means.major_axis, months)

    # minor axis: mapping of column -> group
    minor_groups = {'A': 0, 'B': 0, 'C': 1, 'D': 1}
    means = panel.groupby(minor_groups, axis='minor').mean()
    tm.assert_index_equal(means.minor_axis, Index([0, 1]))


def test_groupby_2d_malformed():
    """mean() of a frame assembled column by column, grouped by 'group'."""
    frame = DataFrame(index=lrange(2))
    # insertion order fixes the column order checked below
    for label, values in (('group', ['g1', 'g2']),
                          ('zeros', [0, 0]),
                          ('ones', [1, 1]),
                          ('label', ['l1', 'l2'])):
        frame[label] = values

    means = frame.groupby(['group']).mean()

    tm.assert_index_equal(means.columns, Index(['zeros', 'ones']))
    expected_values = np.array([[0, 1], [0, 1]], dtype=np.int64)
    tm.assert_numpy_array_equal(means.values, expected_values)


def test_int32_overflow():
    """Reversing the key order must not change the number of groups."""
    pieces = (np.arange(10000), np.arange(10000), np.arange(5000))
    B = np.concatenate(pieces)
    A = np.arange(25000)
    frame = DataFrame({'A': A,
                       'B': B,
                       'C': A,
                       'D': B,
                       'E': np.random.randn(25000)})

    forward = frame.groupby(['A', 'B', 'C', 'D']).sum()
    backward = frame.groupby(['D', 'C', 'B', 'A']).sum()
    assert len(forward) == len(backward)


def test_groupby_sort_multi():
    """Sorted multi-key sums are indexed by the key tuples in sorted order."""
    def _row_tuples(data, cols):
        # one tuple per row of the selected key columns
        return com.asarray_tuplesafe(lmap(tuple, data[cols].values))

    frame = DataFrame({'a': ['foo', 'bar', 'baz'],
                       'b': [3, 2, 1],
                       'c': [0, 1, 2],
                       'd': np.random.randn(3)})

    tups = _row_tuples(frame, ['a', 'b', 'c'])
    result = frame.groupby(['a', 'b', 'c'], sort=True).sum()
    tm.assert_numpy_array_equal(result.index.values, tups[[1, 2, 0]])

    # 'c' is already ascending, so the row order is the sorted order
    tups = _row_tuples(frame, ['c', 'a', 'b'])
    result = frame.groupby(['c', 'a', 'b'], sort=True).sum()
    tm.assert_numpy_array_equal(result.index.values, tups)

    tups = _row_tuples(frame, ['b', 'c', 'a'])
    result = frame.groupby(['b', 'c', 'a'], sort=True).sum()
    tm.assert_numpy_array_equal(result.index.values, tups[[2, 1, 0]])

    frame = DataFrame({'a': [0, 1, 2, 0, 1, 2],
                       'b': [0, 0, 0, 1, 1, 1],
                       'd': np.random.randn(6)})
    result = frame.groupby(['a', 'b'])['d'].sum()

    def _check_groupby(df, result, keys, field, f=lambda x: x.sum()):
        # group by an array of tuples and compare entry by entry
        expected = f(df.groupby(_row_tuples(df, keys))[field])
        for key, value in compat.iteritems(expected):
            assert (result[key] == value)

    _check_groupby(frame, result, ['a', 'b'], 'd')


def test_dont_clobber_name_column():
    """An identity apply returns the frame, 'name' column included."""
    frame = DataFrame({'key': ['a', 'a', 'a', 'b', 'b', 'b'],
                       'name': ['foo', 'bar', 'baz'] * 2})

    def identity(group):
        return group

    result = frame.groupby('key').apply(identity)
    assert_frame_equal(result, frame)


def test_skip_group_keys():
    """With group_keys=False, apply equals concatenating the pieces."""
    tsf = tm.makeTimeDataFrame()

    def by_month(stamp):
        return stamp.month

    # DataFrame: first three rows of each month after sorting by 'A'
    def head_by_a(frame):
        return frame.sort_values(by='A')[:3]

    grouped = tsf.groupby(by_month, group_keys=False)
    result = grouped.apply(head_by_a)
    expected = pd.concat([head_by_a(group) for _, group in grouped])
    assert_frame_equal(result, expected)

    # Series: first three values of each month after sorting
    def head_sorted(ser):
        return ser.sort_values()[:3]

    grouped = tsf['A'].groupby(by_month, group_keys=False)
    result = grouped.apply(head_sorted)
    expected = pd.concat([head_sorted(group) for _, group in grouped])
    assert_series_equal(result, expected)


def test_no_nonsense_name(frame):
    # GH #995
    s = frame['C'].copy()
    s.name = None

    result = s.groupby(frame['A']).agg(np.sum)
    assert result.name is None


def test_multifunc_sum_bug():
    """A float column summed next to a 'size' column stays float64."""
    # GH #1065
    frame = DataFrame(np.arange(9).reshape(3, 3))
    frame['test'] = 0
    frame['fl'] = [1.3, 1.5, 1.6]

    spec = {'fl': 'sum', 2: 'size'}
    result = frame.groupby('test').agg(spec)
    assert result['fl'].dtype == np.float64


def test_handle_dict_return_value(df):
    def f(group):
        return {'max': group.max(), 'min': group.min()}

    def g(group):
        return Series({'max': group.max(), 'min': group.min()})

    result = df.groupby('A')['C'].apply(f)
    expected = df.groupby('A')['C'].apply(g)

    assert isinstance(result, Series)
    assert_series_equal(result, expected)


@pytest.mark.parametrize('grouper', ['A', ['A', 'B']])
def test_set_group_name(df, grouper):
    def f(group):
        assert group.name is not None
        return group

    def freduce(group):
        assert group.name is not None
        return group.sum()

    def foo(x):
        return freduce(x)

    grouped = df.groupby(grouper)

    # make sure all these work
    grouped.apply(f)
    grouped.aggregate(freduce)
    grouped.aggregate({'C': freduce, 'D': freduce})
    grouped.transform(f)

    grouped['C'].apply(f)
    grouped['C'].aggregate(freduce)
    grouped['C'].aggregate([freduce, foo])
    grouped['C'].transform(f)


def test_group_name_available_in_inference_pass():
    """``group.name`` is set on every call apply makes, probe included."""
    # gh-15062
    frame = pd.DataFrame({'a': [0, 0, 1, 1, 2, 2], 'b': np.arange(6)})

    seen = []

    def record_name(group):
        seen.append(group.name)
        return group.copy()

    frame.groupby('a', sort=False, group_keys=False).apply(record_name)

    # we expect 2 zeros because we call ``record_name`` once to see if a
    # faster route can be used.
    assert seen == [0, 0, 1, 2]


def test_no_dummy_key_names(df):
    # see gh-1291
    result = df.groupby(df['A'].values).sum()
    assert result.index.name is None

    result = df.groupby([df['A'].values, df['B'].values]).sum()
    assert result.index.names == (None, None)


def test_groupby_sort_multiindex_series():
    """``sort`` is honoured when grouping a Series by MultiIndex levels."""
    # series multiindex groupby sort argument was not being passed through
    # _compress_group_index
    # GH 9444
    names = ['a', 'b']
    source_index = MultiIndex(
        levels=[[1, 2], [1, 2]],
        codes=[[0, 0, 0, 0, 1, 1], [1, 1, 0, 0, 0, 0]],
        names=names)
    mseries = Series([0, 1, 2, 3, 4, 5], index=source_index)

    # first value of each (a, b) pair, in order of first appearance
    first_index = MultiIndex(
        levels=[[1, 2], [1, 2]],
        codes=[[0, 0, 1], [1, 0, 0]],
        names=names)
    unsorted_first = Series([0, 2, 4], index=first_index)

    result = mseries.groupby(level=['a', 'b'], sort=False).first()
    assert_series_equal(result, unsorted_first)

    result = mseries.groupby(level=['a', 'b'], sort=True).first()
    assert_series_equal(result, unsorted_first.sort_index())


def test_groupby_reindex_inside_function():
    """Dict-agg closures that index into their input aggregate identically.

    Two closures built by ``agg_before`` are applied to column ``'high'``
    of a 5-minute frame grouped by calendar day.  They differ only in the
    ``fix`` flag, which makes the closure additionally look up the first
    label of its input.  Both aggregations must give equal frames.
    """
    periods = 1000
    ind = date_range(start='2012/1/1', freq='5min', periods=periods)
    df = DataFrame({'high': np.arange(periods),
                    'low': np.arange(periods)}, index=ind)

    def agg_before(hour, func, fix=False):
        """
            Run an aggregate func on the rows whose hour is before ``hour``.
        """

        def _func(data):
            # honour ``hour`` instead of a hard-coded 11
            d = data.loc[data.index.map(
                lambda x: x.hour < hour)].dropna()
            if fix:
                # label-based lookup whose result is deliberately unused
                data[data.index[0]]
            if len(d) == 0:
                return None
            return func(d)

        return _func

    grouped = df.groupby(lambda x: datetime(x.year, x.month, x.day))
    closure_bad = grouped.agg({'high': agg_before(11, np.max)})
    closure_good = grouped.agg({'high': agg_before(11, np.max, True)})

    assert_frame_equal(closure_bad, closure_good)


def test_groupby_multiindex_missing_pair():
    """Only the (group1, group2) pairs present in the data are returned."""
    # GH9049
    frame = DataFrame({'group1': ['a', 'a', 'a', 'b'],
                       'group2': ['c', 'c', 'd', 'c'],
                       'value': [1, 1, 1, 5]})
    frame = frame.set_index(['group1', 'group2'])

    by_levels = frame.groupby(level=['group1', 'group2'], sort=True)
    res = by_levels.agg('sum')

    pairs = [('a', 'c'), ('a', 'd'), ('b', 'c')]
    idx = MultiIndex.from_tuples(pairs, names=['group1', 'group2'])
    exp = DataFrame([[2], [1], [5]], index=idx, columns=['value'])

    tm.assert_frame_equal(res, exp)


def test_groupby_multiindex_not_lexsorted():
    """Groupby results agree for lexsorted and non-lexsorted MultiIndexes."""
    # GH 11640

    # define the lexsorted version
    sorted_columns = MultiIndex.from_tuples(
        [('a', ''), ('b1', 'c1'), ('b2', 'c2')], names=['b', 'c'])
    lexsorted_df = DataFrame([[1, 3, 4]], columns=sorted_columns)
    assert lexsorted_df.columns.is_lexsorted()

    # define the non-lexsorted version
    raw = DataFrame(columns=['a', 'b', 'c', 'd'],
                    data=[[1, 'b1', 'c1', 3],
                          [1, 'b2', 'c2', 4]])
    pivoted = raw.pivot_table(index='a', columns=['b', 'c'], values='d')
    not_lexsorted_df = pivoted.reset_index()
    assert not not_lexsorted_df.columns.is_lexsorted()

    # compare the results
    tm.assert_frame_equal(lexsorted_df, not_lexsorted_df)

    expected = lexsorted_df.groupby('a').mean()
    with tm.assert_produces_warning(PerformanceWarning):
        result = not_lexsorted_df.groupby('a').mean()
    tm.assert_frame_equal(expected, result)

    # a transforming function should work regardless of sort
    # GH 14776
    frame = DataFrame({'x': ['a', 'a', 'b', 'a'],
                       'y': [1, 1, 2, 2],
                       'z': [1, 2, 3, 4]}).set_index(['x', 'y'])
    assert not frame.index.is_lexsorted()

    for level in [0, 1, [0, 1]]:
        for sort in [False, True]:
            # frame as given
            by_level = frame.groupby(level=level, sort=sort)
            result = by_level.apply(DataFrame.drop_duplicates)
            tm.assert_frame_equal(frame, result)

            # same frame with its index sorted first
            by_level = frame.sort_index().groupby(level=level, sort=sort)
            result = by_level.apply(DataFrame.drop_duplicates)
            tm.assert_frame_equal(frame.sort_index(), result)


def test_index_label_overlaps_location():
    """filter() matches positional selection when labels look positional."""
    # checking we don't have any label/location confusion in
    # the wake of GH5375
    keys = list('ababb')
    kept_positions = [1, 3, 4]

    def more_than_two(group):
        return len(group) > 2

    def _check_filter(frame):
        # frame-level filter
        actual = frame.groupby(keys).filter(more_than_two)
        assert_frame_equal(actual, frame.iloc[kept_positions])

        # same check on the frame's single column as a Series
        ser = frame[0]
        actual = ser.groupby(keys).filter(more_than_two)
        assert_series_equal(actual, ser.take(kept_positions))

    df = DataFrame(list('ABCDE'), index=[2, 0, 2, 1, 1])
    _check_filter(df)

    # ... and again, with a generic Index of floats
    df.index = df.index.astype(float)
    _check_filter(df)


def test_transform_doesnt_clobber_ints():
    """transform('mean') is the same for an int key and its float copy."""
    # GH 7972
    base = np.arange(6)
    int_keyed = DataFrame({'a': base // 2,
                           'b': 2.0 * base,
                           'c': 3.0 * base})
    float_keyed = DataFrame({'a': base // 2 * 1.0,
                             'b': 2.0 * base,
                             'c': 3.0 * base})

    result = int_keyed.groupby('a').transform('mean')
    expected = float_keyed.groupby('a').transform('mean')
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize('sort_column', ['ints', 'floats', 'strings',
                                         ['ints', 'floats'],
                                         ['ints', 'strings']])
@pytest.mark.parametrize('group_column', ['int_groups', 'string_groups',
                                          ['int_groups', 'string_groups']])
def test_groupby_preserves_sort(sort_column, group_column):
    """Each group of a pre-sorted frame is itself still sorted."""
    # Test to ensure that groupby always preserves sort order of original
    # object. Issue #8588 and #9651
    data = {'int_groups': [3, 1, 0, 1, 0, 3, 3, 3],
            'string_groups': ['z', 'a', 'z', 'a', 'a', 'g', 'g', 'g'],
            'ints': [8, 7, 4, 5, 2, 9, 1, 1],
            'floats': [2.3, 5.3, 6.2, -2.4, 2.2, 1.1, 1.1, 5],
            'strings': ['z', 'd', 'a', 'e', 'word', 'word2', '42', '47']}

    # Try sorting on different types and with different group types
    ordered = DataFrame(data).sort_values(by=sort_column)

    def _assert_still_sorted(group):
        # re-sorting an already sorted group must leave it unchanged
        assert_frame_equal(group, group.sort_values(by=sort_column))

    ordered.groupby(group_column).apply(_assert_still_sorted)


def test_group_shift_with_null_key():
    """shift(-1) yields NaN wherever part of the group key is missing."""
    # This test is designed to replicate the segfault in issue #13813.
    n_rows = 1200

    # Generate a moderately large dataframe with occasional missing
    # values in column `B`, and then group by [`A`, `B`]. This should
    # force `-1` in `labels` array of `g.grouper.group_info` exactly
    # at those places, where the group-by key is partially missing.
    def second_key(i):
        return i % 3 if i % 3 else np.nan

    rows = [(i % 12, second_key(i), i) for i in range(n_rows)]
    df = DataFrame(rows, dtype=float,
                   columns=["A", "B", "Z"], index=None)
    g = df.groupby(["A", "B"])

    def shifted_value(i):
        # the next row with the same keys is 12 rows further down
        if i % 3 and i < n_rows - 12:
            return i + 12
        return np.nan

    expected = DataFrame([shifted_value(i) for i in range(n_rows)],
                         dtype=float, columns=["Z"], index=None)
    result = g.shift(-1)

    assert_frame_equal(result, expected)


def test_group_shift_with_fill_value():
    """shift(-1, fill_value=0) puts 0 where a group has no next row."""
    # GH #24128
    n_rows = 24
    rows = [(i % 12, i % 3, i) for i in range(n_rows)]
    df = DataFrame(rows, dtype=float,
                   columns=["A", "B", "Z"], index=None)
    g = df.groupby(["A", "B"])

    def shifted_value(i):
        # the next row with the same keys is 12 rows further down
        if i < n_rows - 12:
            return i + 12
        return 0

    expected = DataFrame([shifted_value(i) for i in range(n_rows)],
                         dtype=float, columns=["Z"], index=None)
    result = g.shift(-1, fill_value=0)[["Z"]]

    assert_frame_equal(result, expected)


def test_pivot_table_values_key_error():
    """pivot_table raises KeyError when ``values`` names a missing column."""
    # This test is designed to replicate the error in issue #14938
    # ``datetime`` is the class imported at the top of this module; the
    # ``pd.datetime`` alias is not part of the documented pandas API.
    df = pd.DataFrame({'eventDate':
                       pd.date_range(datetime.today(),
                                     periods=20, freq='M').tolist(),
                       'thename': range(0, 20)})

    df['year'] = df.set_index('eventDate').index.year
    df['month'] = df.set_index('eventDate').index.month

    with pytest.raises(KeyError, match="'badname'"):
        df.reset_index().pivot_table(index='year', columns='month',
                                     values='badname', aggfunc='count')


def test_empty_dataframe_groupby():
    """Summing an empty frame by 'A' leaves empty float 'B'/'C' columns."""
    # GH8093
    empty = DataFrame(columns=['A', 'B', 'C'])

    expected = DataFrame(columns=['B', 'C'], dtype=np.float64)
    expected.index.name = 'A'

    result = empty.groupby('A').sum()
    assert_frame_equal(result, expected)


def test_tuple_warns():
    """A tuple ``by`` warns unless the tuple itself is a column label."""
    # https://github.com/pandas-dev/pandas/issues/18314
    frame = pd.DataFrame({('a', 'b'): [1, 1, 2, 2], 'a': [1, 1, 1, 2],
                          'b': [1, 2, 2, 2], 'c': [1, 1, 1, 1]})

    # without the ('a', 'b') column the tuple is read as a list of keys
    with tm.assert_produces_warning(FutureWarning) as caught:
        frame[['a', 'b', 'c']].groupby(('a', 'b')).c.mean()

    assert "Interpreting tuple 'by' as a list" in str(caught[0].message)

    # with the ('a', 'b') column present nothing is emitted
    with tm.assert_produces_warning(None):
        frame.groupby(('a', 'b')).c.mean()


def test_tuple_warns_unhashable():
    """A tuple of index attributes as ``by`` triggers the tuple warning."""
    # https://github.com/pandas-dev/pandas/issues/18314
    business_dates = date_range(start='4/1/2014', end='6/30/2014',
                                freq='B')
    frame = DataFrame(1, index=business_dates, columns=['a', 'b'])

    with tm.assert_produces_warning(FutureWarning) as caught:
        by_year_month = (frame.index.year, frame.index.month)
        frame.groupby(by_year_month).nth([0, 3, -1])

    assert "Interpreting tuple 'by' as a list" in str(caught[0].message)


def test_tuple_correct_keyerror():
    """A missing tuple key is reported whole in the KeyError message."""
    # https://github.com/pandas-dev/pandas/issues/18798
    columns = pd.MultiIndex.from_product([[1, 2], [3, 4]])
    frame = pd.DataFrame(1, index=range(3), columns=columns)

    with pytest.raises(KeyError, match=r"^\(7, 8\)$"):
        frame.groupby((7, 8)).mean()


def test_groupby_agg_ohlc_non_first():
    """``agg(['sum', 'ohlc'])`` works with 'ohlc' not listed first."""
    # GH 21716
    df = pd.DataFrame([[1], [1]], columns=['foo'],
                      index=pd.date_range('2018-01-01', periods=2, freq='D'))

    column_tuples = (
        ('foo', 'ohlc', 'open'), ('foo', 'ohlc', 'high'),
        ('foo', 'ohlc', 'low'), ('foo', 'ohlc', 'close'),
        ('foo', 'sum', 'foo'))
    expected = pd.DataFrame(
        [[1, 1, 1, 1, 1],
         [1, 1, 1, 1, 1]],
        columns=pd.MultiIndex.from_tuples(column_tuples),
        index=pd.date_range('2018-01-01', periods=2, freq='D'))

    daily = df.groupby(pd.Grouper(freq='D'))
    result = daily.agg(['sum', 'ohlc'])

    tm.assert_frame_equal(result, expected)
