From 498338ebe4a66cc05fb2cf78a497e5908c66c142 Mon Sep 17 00:00:00 2001 From: Mani Sarkar Date: Sat, 15 May 2021 00:50:01 +0100 Subject: [PATCH] Granular features: linking syllables count feature to pipeline, and updating the acceptance tests for the syllables count feature --- nlp_profiler/constants.py | 1 + nlp_profiler/granular_features/__init__.py | 7 ++-- .../data/expected_profiled_dataframe.csv | 32 +++++++++---------- ...ected_profiled_dataframe_no_high_level.csv | 32 +++++++++---------- 4 files changed, 38 insertions(+), 34 deletions(-) diff --git a/nlp_profiler/constants.py b/nlp_profiler/constants.py index 29a4602..12c5ea3 100644 --- a/nlp_profiler/constants.py +++ b/nlp_profiler/constants.py @@ -59,5 +59,6 @@ CHARACTERS_COUNT_COL = 'characters_count' ENGLISH_CHARACTERS_COUNT_COL = 'english_characters_count' NON_ENGLISH_CHARACTERS_COUNT_COL = 'non_english_characters_count' +SYLLABLES_COUNT_COL = 'syllables_count' SENTENCES_COUNT_COL = 'sentences_count' NOUN_PHASE_COUNT_COL = 'noun_phase_count' diff --git a/nlp_profiler/granular_features/__init__.py b/nlp_profiler/granular_features/__init__.py index 1d48088..3c1263c 100644 --- a/nlp_profiler/granular_features/__init__.py +++ b/nlp_profiler/granular_features/__init__.py @@ -11,7 +11,8 @@ REPEATED_SPACES_COUNT_COL, WHITESPACES_COUNT_COL, CHARS_EXCL_WHITESPACES_COUNT_COL, \ REPEATED_WHITESPACES_COUNT_COL, ALPHA_NUMERIC_COUNT_COL, REPEATED_LETTERS_COUNT_COL, \ WHOLE_NUMBERS_COUNT_COL, REPEATED_DIGITS_COUNT_COL, EMOJI_COUNT_COL, \ - NOUN_PHASE_COUNT_COL, ENGLISH_CHARACTERS_COUNT_COL, NON_ENGLISH_CHARACTERS_COUNT_COL + NOUN_PHASE_COUNT_COL, ENGLISH_CHARACTERS_COUNT_COL, NON_ENGLISH_CHARACTERS_COUNT_COL, \ + SYLLABLES_COUNT_COL from nlp_profiler.generate_features import generate_features from nlp_profiler.granular_features.alphanumeric import count_alpha_numeric from nlp_profiler.granular_features.chars_spaces_and_whitespaces \ @@ -28,6 +29,7 @@ from nlp_profiler.granular_features.letters import count_repeated_letters from nlp_profiler.granular_features.non_alphanumeric import count_non_alpha_numeric from nlp_profiler.granular_features.noun_phase_count import count_noun_phase +from nlp_profiler.granular_features.syllables import count_syllables from nlp_profiler.granular_features.numbers import count_whole_numbers, count_repeated_digits from nlp_profiler.granular_features.punctuations import count_punctuations, count_repeated_punctuations from nlp_profiler.granular_features.sentences import count_sentences @@ -62,7 +64,8 @@ def apply_granular_features(heading: str, (DATES_COUNT_COL, text_column, count_dates), (NOUN_PHASE_COUNT_COL, text_column, count_noun_phase), (ENGLISH_CHARACTERS_COUNT_COL, text_column, count_english_chars), - (NON_ENGLISH_CHARACTERS_COUNT_COL, text_column, count_non_english_chars) + (NON_ENGLISH_CHARACTERS_COUNT_COL, text_column, count_non_english_chars), + (SYLLABLES_COUNT_COL, text_column, count_syllables) ] generate_features( heading, steps_for_features, diff --git a/tests/acceptance_tests/data/expected_profiled_dataframe.csv b/tests/acceptance_tests/data/expected_profiled_dataframe.csv index 393f1a5..ff26577 100644 --- a/tests/acceptance_tests/data/expected_profiled_dataframe.csv +++ b/tests/acceptance_tests/data/expected_profiled_dataframe.csv @@ -1,23 +1,23 @@ -text,sentences_count,characters_count,repeated_letters_count,spaces_count,chars_excl_spaces_count,repeated_spaces_count,whitespaces_count,chars_excl_whitespaces_count,repeated_whitespaces_count,count_words,duplicates_count,emoji_count,repeated_digits_count,whole_numbers_count,alpha_numeric_count,non_alpha_numeric_count,punctuations_count,repeated_punctuations_count,stop_words_count,dates_count,noun_phase_count,english_characters_count,non_english_characters_count,sentiment_polarity_score,sentiment_polarity,sentiment_polarity_summarised,sentiment_subjectivity_score,sentiment_subjectivity,sentiment_subjectivity_summarised,spelling_quality_score,spelling_quality,spelling_quality_summarised,ease_of_reading_score,ease_of_reading_quality,ease_of_reading_summarised -I love ⚽ very much 😁.,1,21,0,5,16,0,5,16,0,4,0,2,0,0,13,8,1,0,1,0,3,19,2,0.38,Pretty positive,Positive,0.43,Objective/subjective,Objective/subjective,1.0,Very good,Good,116.15,Very Easy,Easy -2833047 people live in this area. It is not a good area.,2,56,0,11,45,0,11,45,0,11,2,0,1,1,43,13,2,0,5,0,4,56,0,-0.10681818181818181,Pretty negative,Negative,0.55,Objective/subjective,Objective/subjective,1.0,Very good,Good,107.69,Very Easy,Easy -2833047 and 1111 people live in this area.,1,42,0,7,35,0,7,35,0,6,0,0,2,2,34,8,1,0,3,0,2,42,0,0.13636363636363635,Pretty positive,Positive,0.5,Objective/subjective,Objective/subjective,1.0,Very good,Good,105.66,Very Easy,Easy -Harrington PPPPPPpppppeople work hard. I think they have a goodd traittttt.,2,75,3,10,65,0,10,65,0,11,1,0,0,0,63,12,2,0,3,0,5,75,0,-0.2916666666666667,Pretty negative,Negative,0.5416666666666666,Objective/subjective,Objective/subjective,0.6923076923076923,Bad,Bad,91.27,Very Easy,Easy -283047 people live in this area3333 22224444,1,44,0,6,38,0,6,38,0,4,0,0,3,3,38,6,0,0,2,0,2,44,0,0.13636363636363635,Pretty positive,Positive,0.5,Objective/subjective,Objective/subjective,0.8571428571428572,Bad,Bad,106.67,Very Easy,Easy -"This sentence does not seem to have too many commas, periods or semicolons (;).",1,79,0,13,66,0,13,66,0,13,0,0,0,0,61,18,5,0,6,0,5,79,0,0.375,Pretty positive,Positive,0.75,Pretty subjective,Subjective,0.9444444444444444,Pretty good,Good,66.74,Standard,Standard -283047 people live in this area[[[ ]]] :::;;;;££,1,48,0,7,41,0,7,41,0,5,4,1,0,1,26,22,15,5,2,0,7,48,0,0.13636363636363635,Pretty positive,Positive,0.5,Objective/subjective,Objective/subjective,0.95,Good,Good,89.75,Easy,Easy -"The date today is 04/28/2020 for format mm/dd/yyyy, not 28/04/2020.",1,67,1,9,58,0,9,58,0,10,0,0,0,6,50,17,8,0,3,1,4,67,0,0.0,Neutral,Neutral,0.0,Very objective,Objective,0.75,Bad,Bad,86.71,Easy,Easy -The date today is 28/04/2020 and tomorrow's date is 29/04/2020.,1,63,0,9,54,0,9,54,0,9,2,0,0,6,48,15,6,0,3,2,4,63,0,0.0,Neutral,Neutral,0.0,Very objective,Objective,0.75,Bad,Bad,86.71,Easy,Easy -Everyone here works so hard. People work hard. I think they have a good trait.,3,78,0,14,64,0,14,64,0,15,2,0,0,0,61,17,3,0,5,0,5,78,0,0.03888888888888886,Pretty positive,Positive,0.5611111111111111,Objective/subjective,Objective/subjective,1.0,Very good,Good,100.24,Very Easy,Easy -283047 people live in this area,1,34,0,8,26,2,8,26,2,5,0,0,0,1,26,8,0,0,2,0,2,34,0,0.13636363636363635,Pretty positive,Positive,0.5,Objective/subjective,Objective/subjective,1.0,Very good,Good,56.93,Fairly Difficult,Difficult +text,sentences_count,characters_count,repeated_letters_count,spaces_count,chars_excl_spaces_count,repeated_spaces_count,whitespaces_count,chars_excl_whitespaces_count,repeated_whitespaces_count,count_words,duplicates_count,emoji_count,repeated_digits_count,whole_numbers_count,alpha_numeric_count,non_alpha_numeric_count,punctuations_count,repeated_punctuations_count,stop_words_count,dates_count,noun_phase_count,english_characters_count,non_english_characters_count,syllables_count,sentiment_polarity_score,sentiment_polarity,sentiment_polarity_summarised,sentiment_subjectivity_score,sentiment_subjectivity,sentiment_subjectivity_summarised,spelling_quality_score,spelling_quality,spelling_quality_summarised,ease_of_reading_score,ease_of_reading_quality,ease_of_reading_summarised +I love ⚽ very much 😁.,1,21,0,5,16,0,5,16,0,4,0,2,0,0,13,8,1,0,1,0,3,19,2,6,0.38,Pretty positive,Positive,0.43,Objective/subjective,Objective/subjective,1.0,Very good,Good,116.15,Very Easy,Easy +2833047 people live in this area. It is not a good area.,2,56,0,11,45,0,11,45,0,11,2,0,1,1,43,13,2,0,5,0,4,56,0,13,-0.10681818181818181,Pretty negative,Negative,0.55,Objective/subjective,Objective/subjective,1.0,Very good,Good,107.69,Very Easy,Easy +2833047 and 1111 people live in this area.,1,42,0,7,35,0,7,35,0,6,0,0,2,2,34,8,1,0,3,0,2,42,0,9,0.13636363636363635,Pretty positive,Positive,0.5,Objective/subjective,Objective/subjective,1.0,Very good,Good,105.66,Very Easy,Easy +Harrington PPPPPPpppppeople work hard. I think they have a goodd traittttt.,2,75,3,10,65,0,10,65,0,11,1,0,0,0,63,12,2,0,3,0,5,75,0,14,-0.2916666666666667,Pretty negative,Negative,0.5416666666666666,Objective/subjective,Objective/subjective,0.6923076923076923,Bad,Bad,91.27,Very Easy,Easy +283047 people live in this area3333 22224444,1,44,0,6,38,0,6,38,0,4,0,0,3,3,38,6,0,0,2,0,2,44,0,8,0.13636363636363635,Pretty positive,Positive,0.5,Objective/subjective,Objective/subjective,0.8571428571428572,Bad,Bad,106.67,Very Easy,Easy +"This sentence does not seem to have too many commas, periods or semicolons (;).",1,79,0,13,66,0,13,66,0,13,0,0,0,0,61,18,5,0,6,0,5,79,0,19,0.375,Pretty positive,Positive,0.75,Pretty subjective,Subjective,0.9444444444444444,Pretty good,Good,66.74,Standard,Standard +283047 people live in this area[[[ ]]] :::;;;;££,1,48,0,7,41,0,7,41,0,5,4,1,0,1,26,22,15,5,2,0,7,48,0,9,0.13636363636363635,Pretty positive,Positive,0.5,Objective/subjective,Objective/subjective,0.95,Good,Good,89.75,Easy,Easy +"The date today is 04/28/2020 for format mm/dd/yyyy, not 28/04/2020.",1,67,1,9,58,0,9,58,0,10,0,0,0,6,50,17,8,0,3,1,4,67,0,13,0.0,Neutral,Neutral,0.0,Very objective,Objective,0.75,Bad,Bad,86.71,Easy,Easy +The date today is 28/04/2020 and tomorrow's date is 29/04/2020.,1,63,0,9,54,0,9,54,0,9,2,0,0,6,48,15,6,0,3,2,4,63,0,13,0.0,Neutral,Neutral,0.0,Very objective,Objective,0.75,Bad,Bad,86.71,Easy,Easy +Everyone here works so hard. People work hard. I think they have a good trait.,3,78,0,14,64,0,14,64,0,15,2,0,0,0,61,17,3,0,5,0,5,78,0,18,0.03888888888888886,Pretty positive,Positive,0.5611111111111111,Objective/subjective,Objective/subjective,1.0,Very good,Good,100.24,Very Easy,Easy +283047 people live in this area,1,34,0,8,26,2,8,26,2,5,0,0,0,1,26,8,0,0,2,0,2,34,0,10,0.13636363636363635,Pretty positive,Positive,0.5,Objective/subjective,Objective/subjective,1.0,Very good,Good,56.93,Fairly Difficult,Difficult "2833047 pe -ople li ve i n this area",1,35,0,5,30,0,8,27,0,8,0,0,1,1,27,8,0,0,3,0,6,35,0,0.0,Neutral,Neutral,0.0,Very objective,Objective,0.8888888888888888,Bad,Bad,130.02,Very Easy,Easy +ople li ve i n this area",1,35,0,5,30,0,8,27,0,8,0,0,1,1,27,8,0,0,3,0,6,35,0,7,0.0,Neutral,Neutral,0.0,Very objective,Objective,0.8888888888888888,Bad,Bad,130.02,Very Easy,Easy "2833047 people live in th -is are a",1,45,0,11,34,3,18,27,6,7,0,0,1,1,27,18,0,0,4,0,2,45,0,0.13636363636363635,Pretty positive,Positive,0.5,Objective/subjective,Objective/subjective,1.0,Very good,Good,63.36,Standard,Standard +is are a",1,45,0,11,34,3,18,27,6,7,0,0,1,1,27,18,0,0,4,0,2,45,0,13,0.13636363636363635,Pretty positive,Positive,0.5,Objective/subjective,Objective/subjective,1.0,Very good,Good,63.36,Standard,Standard "±§£ABCDEabcdef0123456789 is are -!#$%&()*+-./:;<=>?@[\]^_`{|}~",2,62,0,1,61,0,4,58,0,3,0,0,0,1,26,36,31,0,2,0,11,61,1,0.0,Neutral,Neutral,0.0,Very objective,Objective,0.8928571428571429,Bad,Bad,93.81,Very Easy,Easy +!#$%&()*+-./:;<=>?@[\]^_`{|}~",2,62,0,1,61,0,4,58,0,3,0,0,0,1,26,36,31,0,2,0,11,61,1,4,0.0,Neutral,Neutral,0.0,Very objective,Objective,0.8928571428571429,Bad,Bad,93.81,Very Easy,Easy "2833047 pe -ople li ve i n this area‚ƒ„…†‡ˆ‰Š‹ŒŽ•™š›œžŸ¡¢¤¥¦¨©ª«¬­®¯°²³´""µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿThis sentence is in japanese (kana) ろぬふうえおやゆゆわほへへてThis sentence is in japanese (kana compact) おっあこおおがおんわおفصصصشببلااتنخمككك This sentence is in arabic",1,298,0,23,275,0,26,272,0,42,8,3,1,1,112,186,5,0,9,0,20,143,155,0.0,Neutral,Neutral,0.0,Very objective,Objective,0.7435897435897436,Bad,Bad,69.45,Standard,Standard +ople li ve i n this area‚ƒ„…†‡ˆ‰Š‹ŒŽ•™š›œžŸ¡¢¤¥¦¨©ª«¬­®¯°²³´""µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿThis sentence is in japanese (kana) ろぬふうえおやゆゆわほへへてThis sentence is in japanese (kana compact) おっあこおおがおんわおفصصصشببلااتنخمككك This sentence is in arabic",1,298,0,23,275,0,26,272,0,42,8,3,1,1,112,186,5,0,9,0,20,143,155,34,0.0,Neutral,Neutral,0.0,Very objective,Objective,0.7435897435897436,Bad,Bad,69.45,Standard,Standard diff --git a/tests/acceptance_tests/data/expected_profiled_dataframe_no_high_level.csv b/tests/acceptance_tests/data/expected_profiled_dataframe_no_high_level.csv index 13d7b4b..0d5f53d 100644 --- a/tests/acceptance_tests/data/expected_profiled_dataframe_no_high_level.csv +++ b/tests/acceptance_tests/data/expected_profiled_dataframe_no_high_level.csv @@ -1,23 +1,23 @@ -text,sentences_count,characters_count,repeated_letters_count,spaces_count,chars_excl_spaces_count,repeated_spaces_count,whitespaces_count,chars_excl_whitespaces_count,repeated_whitespaces_count,count_words,duplicates_count,emoji_count,repeated_digits_count,whole_numbers_count,alpha_numeric_count,non_alpha_numeric_count,punctuations_count,repeated_punctuations_count,stop_words_count,dates_count,noun_phase_count,english_characters_count,non_english_characters_count -I love ⚽ very much 😁.,1,21,0,5,16,0,5,16,0,4,0,2,0,0,13,8,1,0,1,0,3,19,2 -2833047 people live in this area. It is not a good area.,2,56,0,11,45,0,11,45,0,11,2,0,1,1,43,13,2,0,5,0,4,56,0 -2833047 and 1111 people live in this area.,1,42,0,7,35,0,7,35,0,6,0,0,2,2,34,8,1,0,3,0,2,42,0 -Harrington PPPPPPpppppeople work hard. I think they have a goodd traittttt.,2,75,3,10,65,0,10,65,0,11,1,0,0,0,63,12,2,0,3,0,5,75,0 -283047 people live in this area3333 22224444,1,44,0,6,38,0,6,38,0,4,0,0,3,3,38,6,0,0,2,0,2,44,0 -"This sentence does not seem to have too many commas, periods or semicolons (;).",1,79,0,13,66,0,13,66,0,13,0,0,0,0,61,18,5,0,6,0,5,79,0 -283047 people live in this area[[[ ]]] :::;;;;££,1,48,0,7,41,0,7,41,0,5,4,1,0,1,26,22,15,5,2,0,7,48,0 -"The date today is 04/28/2020 for format mm/dd/yyyy, not 28/04/2020.",1,67,1,9,58,0,9,58,0,10,0,0,0,6,50,17,8,0,3,1,4,67,0 -The date today is 28/04/2020 and tomorrow's date is 29/04/2020.,1,63,0,9,54,0,9,54,0,9,2,0,0,6,48,15,6,0,3,2,4,63,0 -Everyone here works so hard. People work hard. I think they have a good trait.,3,78,0,14,64,0,14,64,0,15,2,0,0,0,61,17,3,0,5,0,5,78,0 -283047 people live in this area,1,34,0,8,26,2,8,26,2,5,0,0,0,1,26,8,0,0,2,0,2,34,0 +text,sentences_count,characters_count,repeated_letters_count,spaces_count,chars_excl_spaces_count,repeated_spaces_count,whitespaces_count,chars_excl_whitespaces_count,repeated_whitespaces_count,count_words,duplicates_count,emoji_count,repeated_digits_count,whole_numbers_count,alpha_numeric_count,non_alpha_numeric_count,punctuations_count,repeated_punctuations_count,stop_words_count,dates_count,noun_phase_count,english_characters_count,non_english_characters_count,syllables_count +I love ⚽ very much 😁.,1,21,0,5,16,0,5,16,0,4,0,2,0,0,13,8,1,0,1,0,3,19,2,6 +2833047 people live in this area. It is not a good area.,2,56,0,11,45,0,11,45,0,11,2,0,1,1,43,13,2,0,5,0,4,56,0,13 +2833047 and 1111 people live in this area.,1,42,0,7,35,0,7,35,0,6,0,0,2,2,34,8,1,0,3,0,2,42,0,9 +Harrington PPPPPPpppppeople work hard. I think they have a goodd traittttt.,2,75,3,10,65,0,10,65,0,11,1,0,0,0,63,12,2,0,3,0,5,75,0,14 +283047 people live in this area3333 22224444,1,44,0,6,38,0,6,38,0,4,0,0,3,3,38,6,0,0,2,0,2,44,0,8 +"This sentence does not seem to have too many commas, periods or semicolons (;).",1,79,0,13,66,0,13,66,0,13,0,0,0,0,61,18,5,0,6,0,5,79,0,19 +283047 people live in this area[[[ ]]] :::;;;;££,1,48,0,7,41,0,7,41,0,5,4,1,0,1,26,22,15,5,2,0,7,48,0,9 +"The date today is 04/28/2020 for format mm/dd/yyyy, not 28/04/2020.",1,67,1,9,58,0,9,58,0,10,0,0,0,6,50,17,8,0,3,1,4,67,0,13 +The date today is 28/04/2020 and tomorrow's date is 29/04/2020.,1,63,0,9,54,0,9,54,0,9,2,0,0,6,48,15,6,0,3,2,4,63,0,13 +Everyone here works so hard. People work hard. I think they have a good trait.,3,78,0,14,64,0,14,64,0,15,2,0,0,0,61,17,3,0,5,0,5,78,0,18 +283047 people live in this area,1,34,0,8,26,2,8,26,2,5,0,0,0,1,26,8,0,0,2,0,2,34,0,10 "2833047 pe -ople li ve i n this area",1,35,0,5,30,0,8,27,0,8,0,0,1,1,27,8,0,0,3,0,6,35,0 +ople li ve i n this area",1,35,0,5,30,0,8,27,0,8,0,0,1,1,27,8,0,0,3,0,6,35,0,7 "2833047 people live in th -is are a",1,45,0,11,34,3,18,27,6,7,0,0,1,1,27,18,0,0,4,0,2,45,0 +is are a",1,45,0,11,34,3,18,27,6,7,0,0,1,1,27,18,0,0,4,0,2,45,0,13 "±§£ABCDEabcdef0123456789 is are -!#$%&()*+-./:;<=>?@[\]^_`{|}~",2,62,0,1,61,0,4,58,0,3,0,0,0,1,26,36,31,0,2,0,11,61,1 +!#$%&()*+-./:;<=>?@[\]^_`{|}~",2,62,0,1,61,0,4,58,0,3,0,0,0,1,26,36,31,0,2,0,11,61,1,4 "2833047 pe -ople li ve i n this area‚ƒ„…†‡ˆ‰Š‹ŒŽ•™š›œžŸ¡¢¤¥¦¨©ª«¬­®¯°²³´""µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿThis sentence is in japanese (kana) ろぬふうえおやゆゆわほへへてThis sentence is in japanese (kana compact) おっあこおおがおんわおفصصصشببلااتنخمككك This sentence is in arabic",1,298,0,23,275,0,26,272,0,42,8,3,1,1,112,186,5,0,9,0,20,143,155 +ople li ve i n this area‚ƒ„…†‡ˆ‰Š‹ŒŽ•™š›œžŸ¡¢¤¥¦¨©ª«¬­®¯°²³´""µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿThis sentence is in japanese (kana) ろぬふうえおやゆゆわほへへてThis sentence is in japanese (kana compact) おっあこおおがおんわおفصصصشببلااتنخمككك This sentence is in arabic",1,298,0,23,275,0,26,272,0,42,8,3,1,1,112,186,5,0,9,0,20,143,155,34