Skip to content

Commit

Permalink
updated ordinal info function
Browse files Browse the repository at this point in the history
  • Loading branch information
SimonMolinsky committed Sep 19, 2023
1 parent 17448c1 commit 7576545
Show file tree
Hide file tree
Showing 3 changed files with 103 additions and 49 deletions.
49 changes: 0 additions & 49 deletions ord_info_function.py

This file was deleted.

53 changes: 53 additions & 0 deletions src/core.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
from typing import Sequence, Union

from numpy import ndarray
import pandas as pd
from pandas import Index, Series
from pandas.core.arrays import ExtensionArray


def info(ds: Sequence,
         indicators: Union[ExtensionArray, ndarray, Index, Series, Sequence, list, range, None]) -> pd.DataFrame:
    """
    Get basic summary statistics for an ordinal dataset.

    Parameters
    ----------
    ds : Sequence
        The collection of answers or grades.
    indicators : Ordered Sequence
        The ordered collection of unique answers or grades. The rows of
        the result follow this order. ``None`` is treated as an empty
        collection.

    Returns
    -------
    summary_df : pandas DataFrame
        Summary statistics for an ordinal dataset, indexed by
        ``indicator``, with columns:

        * ``frequency`` - number of occurrences of the indicator in
          ``ds`` (0 when the indicator never occurs),
        * ``ratio`` - ``frequency`` as a percentage of ``len(ds)``,
        * ``cumulative`` - running percentage in ``indicators`` order,
          capped at 100.

    Notes
    -----
    Percentages are relative to ``len(ds)``, so answers that are not
    listed in ``indicators`` still count towards the denominator.
    """

    # Build a private index so the caller's object is never renamed.
    index = pd.Index([] if indicators is None else indicators, name='indicator')

    # Count the answers, then align the counts to the requested order;
    # indicators that never occur get an explicit 0 instead of NaN.
    observed = pd.Series(ds).value_counts()
    frequency = observed.reindex(index, fill_value=0)

    total_responses = len(ds)
    if total_responses:
        ratio = frequency / total_responses * 100
        # Accumulate integer counts rather than float percentages so that
        # rounding errors do not build up; cap at 100 to keep the column
        # a valid percentage even when ``indicators`` contains repeats.
        cumulative = (frequency.cumsum() / total_responses * 100).clip(upper=100)
    else:
        # No answers at all: every share is 0 (avoids a 0 / 0 division).
        ratio = frequency.astype(float)
        cumulative = ratio.copy()

    # Assemble from raw arrays so the result carries exactly ``index``.
    summary_df = pd.DataFrame(
        {
            'frequency': frequency.to_numpy(),
            'ratio': ratio.to_numpy(),
            'cumulative': cumulative.to_numpy(),
        },
        index=index,
    )
    return summary_df
50 changes: 50 additions & 0 deletions unit_tests/test_02_ordinal_info_function.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
import numpy as np
import pandas as pd

from src.core import info


# Ordered answer scale shared by the tests below; 'test' never occurs in
# DS, so it exercises the zero-frequency path.
INDICATORS = ['not satisfied', 'ok', 'excellent', 'test']

# 48 answers in total: 3 + 10 + 35.
DS = 3 * ['not satisfied'] + 10 * ['ok'] + 35 * ['excellent']

# Summary computed once at import time and reused by every test.
RESULT = info(DS, INDICATORS)


def test_dtype():
    """The summary must be returned as a pandas DataFrame."""
    is_frame = isinstance(RESULT, pd.DataFrame)
    assert is_frame


def test_index_ordering():
    """The summary rows must follow INDICATORS exactly, in order.

    Comparing whole sequences (rather than element by element over the
    result index) also fails when an indicator is missing from the
    result, which a positional loop over a shorter index would not
    notice.
    """
    assert list(RESULT.index.values) == INDICATORS


def test_columns():
    """The summary exposes exactly the three statistic columns."""
    actual_columns = set(RESULT.columns)
    assert actual_columns == {'frequency', 'ratio', 'cumulative'}


def test_ratio():
    """Every ratio is a percentage, so it must lie within [0, 100]."""
    ratios = RESULT['ratio'].values
    # np.all replaces np.alltrue, which was removed in NumPy 2.0.
    assert np.all(ratios >= 0)
    assert np.all(ratios <= 100)


def test_cumulative():
    """Cumulative percentages stay within [0, 100] and never decrease."""
    running = RESULT['cumulative'].values
    floor = 0
    for current in running:
        assert 0 <= current <= 100
        assert current >= floor
        floor = current


def test_range():
    """A ``range`` of indicators with integer answers yields no NaN cells."""
    answers = np.random.randint(1, 6, 200)
    scale = range(1, 6)
    summary = info(answers, scale)
    has_missing = summary.isna().any().any()
    assert not has_missing

0 comments on commit 7576545

Please sign in to comment.